diff --git a/docs/_images/algorithms.png b/docs/_images/algorithms.png index 6c00f21..0849ad7 100644 Binary files a/docs/_images/algorithms.png and b/docs/_images/algorithms.png differ diff --git a/docs/_images/wolpertinger.png b/docs/_images/wolpertinger.png new file mode 100644 index 0000000..e7f9b37 Binary files /dev/null and b/docs/_images/wolpertinger.png differ diff --git a/docs/_modules/index.html b/docs/_modules/index.html index b774d7e..5654895 100644 --- a/docs/_modules/index.html +++ b/docs/_modules/index.html @@ -202,6 +202,7 @@
  • rl_coach.agents.soft_actor_critic_agent
  • rl_coach.agents.td3_agent
  • rl_coach.agents.value_optimization_agent
  • +
  • rl_coach.agents.wolpertinger_agent
  • rl_coach.architectures.architecture
  • rl_coach.architectures.network_wrapper
  • rl_coach.base_parameters
  • diff --git a/docs/_modules/rl_coach/agents/agent.html b/docs/_modules/rl_coach/agents/agent.html index 49d4a8a..0f566aa 100644 --- a/docs/_modules/rl_coach/agents/agent.html +++ b/docs/_modules/rl_coach/agents/agent.html @@ -756,6 +756,9 @@ if self.phase != RunPhase.TEST: if isinstance(self.memory, EpisodicExperienceReplay): + if self.ap.algorithm.override_episode_rewards_with_the_last_transition_reward: + for t in self.current_episode_buffer.transitions: + t.reward = self.current_episode_buffer.transitions[-1].reward self.call_memory('store_episode', self.current_episode_buffer) elif self.ap.algorithm.store_transitions_only_when_episodes_are_terminated: for transition in self.current_episode_buffer.transitions: @@ -910,7 +913,8 @@ # update counters self.training_iteration += 1 if self.pre_network_filter is not None: - batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False) + update_internal_state = self.ap.algorithm.update_pre_network_filters_state_on_train + batch = self.pre_network_filter.filter(batch, update_internal_state=update_internal_state, deep_copy=False) # if the batch returned empty then there are not enough samples in the replay buffer -> skip # training step @@ -1020,7 +1024,8 @@ # informed action if self.pre_network_filter is not None: # before choosing an action, first use the pre_network_filter to filter out the current state - update_filter_internal_state = self.phase is not RunPhase.TEST + update_filter_internal_state = self.ap.algorithm.update_pre_network_filters_state_on_inference and \ + self.phase is not RunPhase.TEST curr_state = self.run_pre_network_filter_for_inference(self.curr_state, update_filter_internal_state) else: @@ -1048,6 +1053,10 @@ :return: The filtered state """ dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False) + + # TODO actually we only want to run the observation filters. No point in running the reward filters as the + # filtered reward is being ignored anyway (and it might unncecessarily affect the reward filters' internal + # state). return self.pre_network_filter.filter(dummy_env_response, update_internal_state=update_filter_internal_state)[0].next_state @@ -1177,7 +1186,7 @@ """ Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent has another master agent that is controlling it. In such cases, the master agent can define the goals for the - slave agent, define it's observation, possible actions, etc. The directive type is defined by the agent + slave agent, define its observation, possible actions, etc. The directive type is defined by the agent in-action-space. 
:param action: The action that should be set as the directive diff --git a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html index 9a3f3b1..ba808c2 100644 --- a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html +++ b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html @@ -295,7 +295,9 @@ self.optimization_epochs = 10 self.normalization_stats = None self.clipping_decay_schedule = ConstantSchedule(1) - self.act_for_full_episodes = True + self.act_for_full_episodes = True + self.update_pre_network_filters_state_on_train = True + self.update_pre_network_filters_state_on_inference = False class ClippedPPOAgentParameters(AgentParameters): @@ -486,7 +488,9 @@ network.set_is_training(True) dataset = self.memory.transitions - dataset = self.pre_network_filter.filter(dataset, deep_copy=False) + update_internal_state = self.ap.algorithm.update_pre_network_filters_state_on_train + dataset = self.pre_network_filter.filter(dataset, deep_copy=False, + update_internal_state=update_internal_state) batch = Batch(dataset) for training_step in range(self.ap.algorithm.num_consecutive_training_steps): @@ -512,7 +516,9 @@ def run_pre_network_filter_for_inference(self, state: StateType, update_internal_state: bool=False): dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False) - return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state + update_internal_state = self.ap.algorithm.update_pre_network_filters_state_on_inference + return self.pre_network_filter.filter( + dummy_env_response, update_internal_state=update_internal_state)[0].next_state def choose_action(self, curr_state): self.ap.algorithm.clipping_decay_schedule.step() diff --git a/docs/_modules/rl_coach/agents/wolpertinger_agent.html b/docs/_modules/rl_coach/agents/wolpertinger_agent.html new file mode 100644 index 0000000..67dd6cd --- /dev/null +++ b/docs/_modules/rl_coach/agents/wolpertinger_agent.html @@ -0,0 +1,356 @@ + + + + + + + + + + + rl_coach.agents.wolpertinger_agent — Reinforcement Learning Coach 0.12.0 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + +
      + +
    • Docs »
    • + +
    • Module code »
    • + +
    • rl_coach.agents.wolpertinger_agent
    • + + +
    • + +
    • + +
    + + +
    +
    +
    +
    + +

    Source code for rl_coach.agents.wolpertinger_agent

    +#
    +# Copyright (c) 2019 Intel Corporation 
    +#
    +# Licensed under the Apache License, Version 2.0 (the "License");
    +# you may not use this file except in compliance with the License.
    +# You may obtain a copy of the License at
    +#
    +#      http://www.apache.org/licenses/LICENSE-2.0
    +#
    +# Unless required by applicable law or agreed to in writing, software
    +# distributed under the License is distributed on an "AS IS" BASIS,
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    +# See the License for the specific language governing permissions and
    +# limitations under the License.
    +#
    +
    +import copy
    +from typing import Union
    +from collections import OrderedDict
    +import numpy as np
    +
    +from rl_coach.agents.ddpg_agent import DDPGAlgorithmParameters, DDPGActorNetworkParameters, \
    +    DDPGCriticNetworkParameters, DDPGAgent
    +from rl_coach.base_parameters import AgentParameters
    +from rl_coach.core_types import ActionInfo
    +from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
    +from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
    +from rl_coach.memories.non_episodic.differentiable_neural_dictionary import AnnoyDictionary
    +from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
    +from rl_coach.architectures.head_parameters import WolpertingerActorHeadParameters
    +
    +
    +class WolpertingerCriticNetworkParameters(DDPGCriticNetworkParameters):
    +    def __init__(self, use_batchnorm=False):
    +        super().__init__(use_batchnorm=use_batchnorm)
    +
    +
    +class WolpertingerActorNetworkParameters(DDPGActorNetworkParameters):
    +    def __init__(self, use_batchnorm=False):
    +        super().__init__()
    +        self.heads_parameters = [WolpertingerActorHeadParameters(batchnorm=use_batchnorm)]
    +
    +
    +
    [docs]class WolpertingerAlgorithmParameters(DDPGAlgorithmParameters): + def __init__(self): + super().__init__() + self.action_embedding_width = 1 + self.k = 1
    + + +class WolpertingerAgentParameters(AgentParameters): + def __init__(self, use_batchnorm=False): + exploration_params = AdditiveNoiseParameters() + exploration_params.noise_as_percentage_from_action_space = False + + super().__init__(algorithm=WolpertingerAlgorithmParameters(), + exploration=exploration_params, + memory=EpisodicExperienceReplayParameters(), + networks=OrderedDict( + [("actor", WolpertingerActorNetworkParameters(use_batchnorm=use_batchnorm)), + ("critic", WolpertingerCriticNetworkParameters(use_batchnorm=use_batchnorm))])) + + @property + def path(self): + return 'rl_coach.agents.wolpertinger_agent:WolpertingerAgent' + + +# Deep Reinforcement Learning in Large Discrete Action Spaces - https://arxiv.org/pdf/1512.07679.pdf +class WolpertingerAgent(DDPGAgent): + def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent'] = None): + super().__init__(agent_parameters, parent) + + def learn_from_batch(self, batch): + # replay buffer holds the actions in the discrete manner, as the agent is expected to act with discrete actions + # with the BoxDiscretization output filter. But DDPG needs to work on continuous actions, thus converting to + # continuous actions. This is actually a duplicate since this filtering is also done before applying actions on + # the environment. So might want to somehow reuse that conversion. Maybe can hold this information in the info + # dictionary of the transition. + + output_action_filter = \ + list(self.output_filter.action_filters.values())[0] + continuous_actions = [] + for action in batch.actions(): + continuous_actions.append(output_action_filter.filter(action)) + batch._actions = np.array(continuous_actions).squeeze() + + return super().learn_from_batch(batch) + + def train(self): + return super().train() + + def choose_action(self, curr_state): + if not isinstance(self.spaces.action, DiscreteActionSpace): + raise ValueError("WolpertingerAgent works only for discrete control problems") + + # convert to batch so we can run it through the network + tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor') + actor_network = self.networks['actor'].online_network + critic_network = self.networks['critic'].online_network + proto_action = actor_network.predict(tf_input_state) + proto_action = np.expand_dims(self.exploration_policy.get_action(proto_action), 0) + + nn_action_embeddings, indices, _, _ = self.knn_tree.query(keys=proto_action, k=self.ap.algorithm.k) + + # now move the actions through the critic and choose the one with the highest q value + critic_inputs = copy.copy(tf_input_state) + critic_inputs['observation'] = np.tile(critic_inputs['observation'], (self.ap.algorithm.k, 1)) + critic_inputs['action'] = nn_action_embeddings[0] + q_values = critic_network.predict(critic_inputs)[0] + action = int(indices[0][np.argmax(q_values)]) + self.action_signal.add_sample(action) + return ActionInfo(action=action, action_value=0) + + def init_environment_dependent_modules(self): + super().init_environment_dependent_modules() + self.knn_tree = self.get_initialized_knn() + + # TODO - ideally the knn should not be defined here, but somehow be defined by the user in the preset + def get_initialized_knn(self): + num_actions = len(self.spaces.action.actions) + action_max_abs_range = self.spaces.action.filtered_action_space.max_abs_range if \ + (hasattr(self.spaces.action, 'filtered_action_space') and + isinstance(self.spaces.action.filtered_action_space, BoxActionSpace)) \ + else 1.0 + keys = 
np.expand_dims((np.arange(num_actions) / (num_actions - 1) - 0.5) * 2, 1) * action_max_abs_range + values = np.expand_dims(np.arange(num_actions), 1) + knn_tree = AnnoyDictionary(dict_size=num_actions, key_width=self.ap.algorithm.action_embedding_width) + knn_tree.add(keys, values, force_rebuild_tree=True) + + return knn_tree + +
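To make the k-NN initialization above easier to follow, here is a small, self-contained sketch of how the discrete action indices are embedded as evenly spaced keys over [-action_max_abs_range, action_max_abs_range]; it mirrors the variable names in get_initialized_knn but is illustrative only and not part of the module.

import numpy as np

num_actions = 5
action_max_abs_range = 1.0  # falls back to 1.0 when no filtered BoxActionSpace is available

# Keys: one evenly spaced 1-D embedding per discrete action, spanning the continuous range.
keys = np.expand_dims((np.arange(num_actions) / (num_actions - 1) - 0.5) * 2, 1) * action_max_abs_range
# Values: the discrete action indices that the embeddings map back to.
values = np.expand_dims(np.arange(num_actions), 1)

print(keys.ravel())    # [-1.  -0.5  0.   0.5  1. ]
print(values.ravel())  # [0 1 2 3 4]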
    + +
    + +
    + + +
    +
    + +
    + +
    + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/_modules/rl_coach/base_parameters.html b/docs/_modules/rl_coach/base_parameters.html index 60aac7f..045363d 100644 --- a/docs/_modules/rl_coach/base_parameters.html +++ b/docs/_modules/rl_coach/base_parameters.html @@ -396,6 +396,14 @@ # Support for parameter noise self.supports_parameter_noise = False + # Override, in retrospective, all the episode rewards with the last reward in the episode + # (sometimes useful for sparse, end of the episode, rewards problems) + self.override_episode_rewards_with_the_last_transition_reward = False + + # Filters - TODO consider creating a FilterParameters class and initialize the filters with it + self.update_pre_network_filters_state_on_train = False + self.update_pre_network_filters_state_on_inference = True +
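As a quick illustration of the new override_episode_rewards_with_the_last_transition_reward flag, the sketch below shows the retroactive reward override the agent applies before storing an episode; the Transition class here is a simplified stand-in, not the Coach implementation.

class Transition:
    def __init__(self, reward):
        self.reward = reward

def override_with_last_reward(transitions):
    # Copy the episode's final (e.g. sparse, end-of-episode) reward onto every transition.
    last_reward = transitions[-1].reward
    for t in transitions:
        t.reward = last_reward
    return transitions

episode = [Transition(0.0), Transition(0.0), Transition(1.0)]
assert [t.reward for t in override_with_last_reward(episode)] == [1.0, 1.0, 1.0]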
    [docs]class PresetValidationParameters(Parameters): def __init__(self, diff --git a/docs/_modules/rl_coach/core_types.html b/docs/_modules/rl_coach/core_types.html index 78d9660..abf4711 100644 --- a/docs/_modules/rl_coach/core_types.html +++ b/docs/_modules/rl_coach/core_types.html @@ -298,6 +298,12 @@ def __init__(self, num_steps): super().__init__(num_steps) + def __truediv__(self, other): + if isinstance(other, EnvironmentSteps): + return math.ceil(self.num_steps / other.num_steps) + else: + super().__truediv__(self, other) + class Time(StepMethod): def __init__(self, num_steps): diff --git a/docs/_modules/rl_coach/data_stores/nfs_data_store.html b/docs/_modules/rl_coach/data_stores/nfs_data_store.html index a67fe7a..25785f7 100644 --- a/docs/_modules/rl_coach/data_stores/nfs_data_store.html +++ b/docs/_modules/rl_coach/data_stores/nfs_data_store.html @@ -200,15 +200,17 @@ import uuid -from rl_coach.data_stores.data_store import DataStore, DataStoreParameters +from rl_coach.data_stores.data_store import DataStoreParameters +from rl_coach.data_stores.checkpoint_data_store import CheckpointDataStore class NFSDataStoreParameters(DataStoreParameters): - def __init__(self, ds_params, deployed=False, server=None, path=None): + def __init__(self, ds_params, deployed=False, server=None, path=None, checkpoint_dir: str=""): super().__init__(ds_params.store_type, ds_params.orchestrator_type, ds_params.orchestrator_params) self.namespace = "default" if "namespace" in ds_params.orchestrator_params: self.namespace = ds_params.orchestrator_params["namespace"] + self.checkpoint_dir = checkpoint_dir self.name = None self.pvc_name = None self.pv_name = None @@ -221,7 +223,7 @@ self.path = path -
    [docs]class NFSDataStore(DataStore): +
    [docs]class NFSDataStore(CheckpointDataStore): """ An implementation of data store which uses NFS for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker. diff --git a/docs/_modules/rl_coach/data_stores/s3_data_store.html b/docs/_modules/rl_coach/data_stores/s3_data_store.html index 64fc0b9..c2e4b9b 100644 --- a/docs/_modules/rl_coach/data_stores/s3_data_store.html +++ b/docs/_modules/rl_coach/data_stores/s3_data_store.html @@ -198,7 +198,8 @@ # -from rl_coach.data_stores.data_store import DataStore, DataStoreParameters +from rl_coach.data_stores.data_store import DataStoreParameters +from rl_coach.data_stores.checkpoint_data_store import CheckpointDataStore from minio import Minio from minio.error import ResponseError from configparser import ConfigParser, Error @@ -222,7 +223,7 @@ self.expt_dir = expt_dir -
    [docs]class S3DataStore(DataStore): +
    [docs]class S3DataStore(CheckpointDataStore): """ An implementation of the data store using S3 for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker. diff --git a/docs/_modules/rl_coach/exploration_policies/additive_noise.html b/docs/_modules/rl_coach/exploration_policies/additive_noise.html index 44e2dc3..92eb352 100644 --- a/docs/_modules/rl_coach/exploration_policies/additive_noise.html +++ b/docs/_modules/rl_coach/exploration_policies/additive_noise.html @@ -245,7 +245,9 @@ self.evaluation_noise = evaluation_noise self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space - if not isinstance(action_space, BoxActionSpace): + if not isinstance(action_space, BoxActionSpace) and \ + (hasattr(action_space, 'filtered_action_space') and not + isinstance(action_space.filtered_action_space, BoxActionSpace)): raise ValueError("Additive noise exploration works only for continuous controls." "The given action space is of type: {}".format(action_space.__class__.__name__)) diff --git a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html index f5ffd7d..faa4583 100644 --- a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html +++ b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html @@ -298,7 +298,10 @@ """ :param action_space: the action space used by the environment """ - assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace) + assert isinstance(action_space, BoxActionSpace) or \ + (hasattr(action_space, 'filtered_action_space') and + isinstance(action_space.filtered_action_space, BoxActionSpace)) or \ + isinstance(action_space, GoalsSpace) super().__init__(action_space)
    diff --git a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html index 7d33198..56d84e7 100644 --- a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html +++ b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html @@ -271,9 +271,6 @@ else: action_values_std = current_noise - # scale the noise to the action space range - action_values_std = current_noise * (self.action_space.high - self.action_space.low) - # extract the mean values if isinstance(action_values, list): # the action values are expected to be a list with the action mean and optionally the action stdev diff --git a/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html b/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html index 0fae9ce..4e4837b 100644 --- a/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html +++ b/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html @@ -231,7 +231,8 @@ def get_unfiltered_action_space(self, output_action_space: ActionSpace) -> DiscreteActionSpace: self.output_action_space = output_action_space - self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions) + self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions, + filtered_action_space=output_action_space) return self.input_action_space def filter(self, action: ActionType) -> ActionType: diff --git a/docs/_modules/rl_coach/memories/backend/redis.html b/docs/_modules/rl_coach/memories/backend/redis.html index c904f4d..7567160 100644 --- a/docs/_modules/rl_coach/memories/backend/redis.html +++ b/docs/_modules/rl_coach/memories/backend/redis.html @@ -261,11 +261,18 @@ """ if 'namespace' not in self.params.orchestrator_params: self.params.orchestrator_params['namespace'] = "default" - from kubernetes import client + from kubernetes import client, config container = client.V1Container( name=self.redis_server_name, image='redis:4-alpine', + resources=client.V1ResourceRequirements( + limits={ + "cpu": "8", + "memory": "4Gi" + # "nvidia.com/gpu": "0", + } + ), ) template = client.V1PodTemplateSpec( metadata=client.V1ObjectMeta(labels={'app': self.redis_server_name}), @@ -288,8 +295,10 @@ spec=deployment_spec ) + config.load_kube_config() api_client = client.AppsV1Api() try: + print(self.params.orchestrator_params) api_client.create_namespaced_deployment(self.params.orchestrator_params['namespace'], deployment) except client.rest.ApiException as e: print("Got exception: %s\n while creating redis-server", e) diff --git a/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html b/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html index 2a5d1f9..063e0e4 100644 --- a/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html +++ b/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html @@ -240,7 +240,7 @@ self.built_capacity = 0 - def add(self, keys, values, additional_data=None): + def add(self, keys, values, additional_data=None, force_rebuild_tree=False): if not additional_data: additional_data = [None] * len(keys) @@ -279,7 +279,7 @@ if len(self.buffered_indices) >= self.min_update_size: self.min_update_size = max(self.initial_update_size, int(self.curr_size * 0.02)) self._rebuild_index() - elif self.rebuild_on_every_update: + elif force_rebuild_tree or 
self.rebuild_on_every_update: self._rebuild_index() self.current_timestamp += 1 diff --git a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html index b8c99db..71d144f 100644 --- a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html +++ b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html @@ -307,6 +307,11 @@ """ self.memory_backend.deploy() + + if self.params.data_store_params.store_type == "redis": + self.data_store.params.redis_address = self.memory_backend.params.redis_address + self.data_store.params.redis_port = self.memory_backend.params.redis_port + if not self.data_store.deploy(): return False if self.params.data_store_params.store_type == "nfs": @@ -329,6 +334,8 @@ trainer_params.command += ['--data_store_params', json.dumps(self.params.data_store_params.__dict__)] name = "{}-{}".format(trainer_params.run_type, uuid.uuid4()) + # TODO: instead of defining each container and template spec from scratch, loaded default + # configuration and modify them as necessary depending on the store type if self.params.data_store_params.store_type == "nfs": container = k8sclient.V1Container( name=name, @@ -354,7 +361,7 @@ restart_policy='Never' ), ) - else: + elif self.params.data_store_params.store_type == "s3": container = k8sclient.V1Container( name=name, image=trainer_params.image, @@ -373,6 +380,34 @@ restart_policy='Never' ), ) + elif self.params.data_store_params.store_type == "redis": + container = k8sclient.V1Container( + name=name, + image=trainer_params.image, + command=trainer_params.command, + args=trainer_params.arguments, + image_pull_policy='Always', + stdin=True, + tty=True, + resources=k8sclient.V1ResourceRequirements( + limits={ + "cpu": "40", + "memory": "4Gi", + "nvidia.com/gpu": "1", + } + ), + ) + template = k8sclient.V1PodTemplateSpec( + metadata=k8sclient.V1ObjectMeta(labels={'app': name}), + spec=k8sclient.V1PodSpec( + containers=[container], + restart_policy='Never' + ), + ) + else: + raise ValueError("unexpected store_type {}. expected 's3', 'nfs', 'redis'".format( + self.params.data_store_params.store_type + )) job_spec = k8sclient.V1JobSpec( completions=1, @@ -404,12 +439,17 @@ if not worker_params: return False + # At this point, the memory backend and data store have been deployed and in the process, + # these parameters have been updated to include things like the hostname and port the + # service can be found at. 
worker_params.command += ['--memory_backend_params', json.dumps(self.params.memory_backend_parameters.__dict__)] worker_params.command += ['--data_store_params', json.dumps(self.params.data_store_params.__dict__)] worker_params.command += ['--num_workers', '{}'.format(worker_params.num_replicas)] name = "{}-{}".format(worker_params.run_type, uuid.uuid4()) + # TODO: instead of defining each container and template spec from scratch, loaded default + # configuration and modify them as necessary depending on the store type if self.params.data_store_params.store_type == "nfs": container = k8sclient.V1Container( name=name, @@ -435,7 +475,7 @@ restart_policy='Never' ), ) - else: + elif self.params.data_store_params.store_type == "s3": container = k8sclient.V1Container( name=name, image=worker_params.image, @@ -454,6 +494,32 @@ restart_policy='Never' ) ) + elif self.params.data_store_params.store_type == "redis": + container = k8sclient.V1Container( + name=name, + image=worker_params.image, + command=worker_params.command, + args=worker_params.arguments, + image_pull_policy='Always', + stdin=True, + tty=True, + resources=k8sclient.V1ResourceRequirements( + limits={ + "cpu": "8", + "memory": "4Gi", + # "nvidia.com/gpu": "0", + } + ), + ) + template = k8sclient.V1PodTemplateSpec( + metadata=k8sclient.V1ObjectMeta(labels={'app': name}), + spec=k8sclient.V1PodSpec( + containers=[container], + restart_policy='Never' + ) + ) + else: + raise ValueError('unexpected store type {}'.format(self.params.data_store_params.store_type)) job_spec = k8sclient.V1JobSpec( completions=worker_params.num_replicas, diff --git a/docs/_modules/rl_coach/spaces.html b/docs/_modules/rl_coach/spaces.html index 2e890d9..30472c3 100644 --- a/docs/_modules/rl_coach/spaces.html +++ b/docs/_modules/rl_coach/spaces.html @@ -568,7 +568,8 @@ """ A discrete action space with action indices as actions """ - def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None): + def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None, + filtered_action_space=None): super().__init__(1, low=0, high=num_actions-1, descriptions=descriptions) # the number of actions is mapped to high @@ -578,6 +579,9 @@ else: self.default_action = default_action + if filtered_action_space is not None: + self.filtered_action_space = filtered_action_space + @property def actions(self) -> List[ActionType]: return list(range(0, int(self.high[0]) + 1)) diff --git a/docs/_sources/components/agents/index.rst.txt b/docs/_sources/components/agents/index.rst.txt index ca21713..c958768 100644 --- a/docs/_sources/components/agents/index.rst.txt +++ b/docs/_sources/components/agents/index.rst.txt @@ -21,8 +21,6 @@ A detailed description of those algorithms can be found by navigating to each of imitation/cil policy_optimization/cppo policy_optimization/ddpg - policy_optimization/td3 - policy_optimization/sac other/dfp value_optimization/double_dqn value_optimization/dqn @@ -36,6 +34,10 @@ A detailed description of those algorithms can be found by navigating to each of policy_optimization/ppo value_optimization/rainbow value_optimization/qr_dqn + policy_optimization/sac + policy_optimization/td3 + policy_optimization/wolpertinger + .. 
autoclass:: rl_coach.base_parameters.AgentParameters diff --git a/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt b/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt new file mode 100644 index 0000000..5aa57d2 --- /dev/null +++ b/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt @@ -0,0 +1,56 @@ +Wolpertinger +============= + +**Actions space:** Discrete + +**References:** `Deep Reinforcement Learning in Large Discrete Action Spaces `_ + +Network Structure +----------------- + +.. image:: /_static/img/design_imgs/wolpertinger.png + :align: center + +Algorithm Description +--------------------- +Choosing an action +++++++++++++++++++ + +Pass the current states through the actor network, and get a proto action :math:`\mu`. +While in training phase, use a continuous exploration policy, such as the a gaussian noise, +to add exploration noise to the proto action. Then, pass the proto action to a k-NN tree to find actual valid +action candidates, which are in the surrounding neighborhood of the proto action. Those actions are then passed to the +critic to evaluate their goodness, and eventually the discrete index of the action with the highest Q value is chosen. +When testing, the same flow is used, but no exploration noise is added. + +Training the network +++++++++++++++++++++ + +Training the network is exactly the same as in DDPG. Unlike when choosing the action, the proto action is not passed +through the k-NN tree. It is being passed directly to the critic. + +Start by sampling a batch of transitions from the experience replay. + +* To train the **critic network**, use the following targets: + + :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))` + + First run the actor target network, using the next states as the inputs, and get :math:`\mu (s_{t+1} )`. + Next, run the critic target network using the next states and :math:`\mu (s_{t+1} )`, and use the output to + calculate :math:`y_t` according to the equation above. To train the network, use the current states and actions + as the inputs, and :math:`y_t` as the targets. + +* To train the **actor network**, use the following equation: + + :math:`\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]` + + Use the actor's online network to get the action mean values using the current states as the inputs. + Then, use the critic online network in order to get the gradients of the critic output with respect to the + action mean values :math:`\nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) }`. + Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights, + given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network. + +After every training step, do a soft update of the critic and actor target networks' weights from the online networks. + + +.. autoclass:: rl_coach.agents.wolpertinger_agent.WolpertingerAlgorithmParameters \ No newline at end of file diff --git a/docs/components/agents/index.html b/docs/components/agents/index.html index 357caad..7c90e3f 100644 --- a/docs/components/agents/index.html +++ b/docs/components/agents/index.html @@ -117,8 +117,6 @@
  • Conditional Imitation Learning
  • Clipped Proximal Policy Optimization
  • Deep Deterministic Policy Gradient
  • -
  • Twin Delayed Deep Deterministic Policy Gradient
  • -
  • Soft Actor-Critic
  • Direct Future Prediction
  • Double DQN
  • Deep Q Networks
  • @@ -132,6 +130,9 @@
  • Proximal Policy Optimization
  • Rainbow
  • Quantile Regression DQN
  • +
  • Soft Actor-Critic
  • +
  • Twin Delayed Deep Deterministic Policy Gradient
  • +
  • Wolpertinger
  • Architectures
  • @@ -226,8 +227,6 @@ A detailed description of those algorithms can be found by navigating to each of
  • Conditional Imitation Learning
  • Clipped Proximal Policy Optimization
  • Deep Deterministic Policy Gradient
  • -
  • Twin Delayed Deep Deterministic Policy Gradient
  • -
  • Soft Actor-Critic
  • Direct Future Prediction
  • Double DQN
  • Deep Q Networks
  • @@ -241,6 +240,9 @@ A detailed description of those algorithms can be found by navigating to each of
  • Proximal Policy Optimization
  • Rainbow
  • Quantile Regression DQN
  • +
  • Soft Actor-Critic
  • +
  • Twin Delayed Deep Deterministic Policy Gradient
  • +
  • Wolpertinger
  • @@ -512,7 +514,7 @@ given observation

    -prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.array][source]
    +prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.core.multiarray.array][source]

Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states, stack all observations together, measurements together, etc.

    @@ -652,7 +654,7 @@ dependent on those values, by calling init_environment_dependent_modules

    set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None[source]

    Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent has another master agent that is controlling it. In such cases, the master agent can define the goals for the -slave agent, define it’s observation, possible actions, etc. The directive type is defined by the agent +slave agent, define its observation, possible actions, etc. The directive type is defined by the agent in-action-space.

    Parameters
    diff --git a/docs/components/agents/policy_optimization/wolpertinger.html b/docs/components/agents/policy_optimization/wolpertinger.html new file mode 100644 index 0000000..cda2378 --- /dev/null +++ b/docs/components/agents/policy_optimization/wolpertinger.html @@ -0,0 +1,276 @@ + + + + + + + + + + + Wolpertinger — Reinforcement Learning Coach 0.12.0 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    +
    +
    + +
    +

    Wolpertinger

    +

    Actions space: Discrete

    +

    References: Deep Reinforcement Learning in Large Discrete Action Spaces

    +
    +

    Network Structure

    +../../../_images/wolpertinger.png +
    +
    +

    Algorithm Description

    +
    +

    Choosing an action

    +

Pass the current states through the actor network to get a proto action \(\mu\). +During the training phase, use a continuous exploration policy, such as additive Gaussian noise, +to add exploration noise to the proto action. Then, pass the proto action through a k-NN tree to find valid +action candidates in the neighborhood of the proto action. Those candidates are then evaluated by the +critic, and the discrete index of the action with the highest Q value is chosen. +When testing, the same flow is used, but no exploration noise is added.

    +
    +
    +
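The following minimal numpy sketch illustrates the selection flow described above; actor_predict, critic_predict and knn_query are hypothetical stand-ins for the actor network, the critic network and the embedding lookup, and are not part of the Coach API.

import numpy as np

def wolpertinger_choose_action(state, actor_predict, critic_predict, knn_query,
                               k=5, noise_std=0.1, training=True):
    # 1. The actor proposes a continuous proto action for the current state.
    proto_action = actor_predict(state)
    # 2. During training, perturb the proto action with continuous exploration noise.
    if training:
        proto_action = proto_action + np.random.normal(0.0, noise_std, size=proto_action.shape)
    # 3. Look up the k discrete actions whose embeddings are nearest to the proto action.
    embeddings, indices = knn_query(proto_action, k)   # shapes: (k, embedding_width), (k,)
    # 4. Score each candidate with the critic and return the index of the best one.
    q_values = np.array([critic_predict(state, embedding) for embedding in embeddings])
    return int(indices[np.argmax(q_values)])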

    Training the network

    +

Training the network is exactly the same as in DDPG. Unlike when choosing an action, the proto action is not passed +through the k-NN tree; it is passed directly to the critic.

    +

    Start by sampling a batch of transitions from the experience replay.

    +
      +
    • To train the critic network, use the following targets:

      +

      \(y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))\)

      +

      First run the actor target network, using the next states as the inputs, and get \(\mu (s_{t+1} )\). +Next, run the critic target network using the next states and \(\mu (s_{t+1} )\), and use the output to +calculate \(y_t\) according to the equation above. To train the network, use the current states and actions +as the inputs, and \(y_t\) as the targets.

      +
    • +
    • To train the actor network, use the following equation:

      +

      \(\nabla_{\theta^\mu } J \approx E_{s_t \tilde{} \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]\)

      +

      Use the actor’s online network to get the action mean values using the current states as the inputs. +Then, use the critic online network in order to get the gradients of the critic output with respect to the +action mean values \(\nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) }\). +Using the chain rule, calculate the gradients of the actor’s output, with respect to the actor weights, +given \(\nabla_a Q(s,a)\). Finally, apply those gradients to the actor network.

      +
    • +
    +

    After every training step, do a soft update of the critic and actor target networks’ weights from the online networks.

    +
    +
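As a rough, hedged illustration of the update rules above, the sketch below computes the critic targets and the soft target-network update with plain numpy; q_target, mu_target and tau are placeholder names and not Coach internals.

import numpy as np

def critic_targets(rewards, next_states, game_overs, q_target, mu_target, discount=0.99):
    # y_t = r(s_t, a_t) + gamma * Q_target(s_{t+1}, mu_target(s_{t+1})),
    # with the bootstrap zeroed on terminal steps.
    next_actions = mu_target(next_states)
    return rewards + discount * (1.0 - game_overs) * q_target(next_states, next_actions)

def soft_update(online_weights, target_weights, tau=0.001):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target,
    # applied after every training step.
    return [tau * w + (1.0 - tau) * t for w, t in zip(online_weights, target_weights)]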
    +class rl_coach.agents.wolpertinger_agent.WolpertingerAlgorithmParameters[source]
    +
    + +
    +
    +
    + + +
    + +
    +
    + + +
    + +
    +

    + © Copyright 2018-2019, Intel AI Lab + +

    +
    + Built with Sphinx using a theme provided by Read the Docs. + +
    + +
    +
    + +
    + +
    + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/components/spaces.html b/docs/components/spaces.html index a62653c..753e334 100644 --- a/docs/components/spaces.html +++ b/docs/components/spaces.html @@ -442,7 +442,7 @@ The actions will be in the form:

    DiscreteActionSpace

    -class rl_coach.spaces.DiscreteActionSpace(num_actions: int, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None)[source]
    +class rl_coach.spaces.DiscreteActionSpace(num_actions: int, descriptions: Union[None, List, Dict] = None, default_action: numpy.ndarray = None, filtered_action_space=None)[source]

    A discrete action space with action indices as actions
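A short usage sketch of the new filtered_action_space argument, assuming the standard rl_coach.spaces constructors (the exact BoxActionSpace arguments are an assumption here): it lets a discrete space keep a reference to the continuous space it was discretized from, which the exploration policies and PartialDiscreteActionSpaceMap rely on.

from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

# The underlying continuous action space that was discretized.
continuous_space = BoxActionSpace(1, low=-1.0, high=1.0)

# The discrete space exposed to the agent keeps a reference to the original continuous space,
# so exploration policies that expect a BoxActionSpace can still find one.
discrete_space = DiscreteActionSpace(num_actions=9, filtered_action_space=continuous_space)
assert isinstance(discrete_space.filtered_action_space, BoxActionSpace)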

    diff --git a/docs/features/benchmarks.html b/docs/features/benchmarks.html index 073faab..845d42c 100644 --- a/docs/features/benchmarks.html +++ b/docs/features/benchmarks.html @@ -37,7 +37,7 @@ - + @@ -95,7 +95,6 @@
  • Algorithms
  • Environments
  • Benchmarks
  • -
  • Batch Reinforcement Learning
  • Selecting an Algorithm
  • @@ -221,7 +220,7 @@ benchmarks stay intact as Coach continues to develop.

    A

    @@ -956,6 +957,14 @@ +

    W

    + + +
    +
    diff --git a/docs/objects.inv b/docs/objects.inv index b2a5dbc..a793fa3 100644 Binary files a/docs/objects.inv and b/docs/objects.inv differ diff --git a/docs/searchindex.js b/docs/searchindex.js index dfcf9de..760e666 100644 --- a/docs/searchindex.js +++ b/docs/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["components/additional_parameters","components/agents/imitation/bc","components/agents/imitation/cil","components/agents/index","components/agents/other/dfp","components/agents/policy_optimization/ac","components/agents/policy_optimization/acer","components/agents/policy_optimization/cppo","components/agents/policy_optimization/ddpg","components/agents/policy_optimization/hac","components/agents/policy_optimization/pg","components/agents/policy_optimization/ppo","components/agents/policy_optimization/sac","components/agents/policy_optimization/td3","components/agents/value_optimization/bs_dqn","components/agents/value_optimization/categorical_dqn","components/agents/value_optimization/double_dqn","components/agents/value_optimization/dqn","components/agents/value_optimization/dueling_dqn","components/agents/value_optimization/mmc","components/agents/value_optimization/n_step","components/agents/value_optimization/naf","components/agents/value_optimization/nec","components/agents/value_optimization/pal","components/agents/value_optimization/qr_dqn","components/agents/value_optimization/rainbow","components/architectures/index","components/core_types","components/data_stores/index","components/environments/index","components/exploration_policies/index","components/filters/index","components/filters/input_filters","components/filters/output_filters","components/memories/index","components/memory_backends/index","components/orchestrators/index","components/spaces","contributing/add_agent","contributing/add_env","dashboard","design/control_flow","design/horizontal_scaling","design/network","dist_usage","features/algorithms","features/batch_rl","features/benchmarks","features/environments","features/index","index","selecting_an_algorithm","test","usage"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["components/additional_parameters.rst","components/agents/imitation/bc.rst","components/agents/imitation/cil.rst","components/agents/index.rst","components/agents/other/dfp.rst","components/agents/policy_optimization/ac.rst","components/agents/policy_optimization/acer.rst","components/agents/policy_optimization/cppo.rst","components/agents/policy_optimization/ddpg.rst","components/agents/policy_optimization/hac.rst","components/agents/policy_optimization/pg.rst","components/agents/policy_optimization/ppo.rst","components/agents/policy_optimization/sac.rst","components/agents/policy_optimization/td3.rst","components/agents/value_optimization/bs_dqn.rst","components/agents/value_optimization/categorical_dqn.rst","components/agents/value_optimization/double_dqn.rst","components/agents/value_optimization/dqn.rst","components/agents/value_optimization/dueling_dqn.rst","components/agents/value_optimization/mmc.rst","components/agents/value_optimization/n_step.rst","components/agents/value_optimization/naf.rst","components/agents/value_optimization/nec.rst","components/agents/value_optimization/pal.rst","components/agents/value_optimizati
on/qr_dqn.rst","components/agents/value_optimization/rainbow.rst","components/architectures/index.rst","components/core_types.rst","components/data_stores/index.rst","components/environments/index.rst","components/exploration_policies/index.rst","components/filters/index.rst","components/filters/input_filters.rst","components/filters/output_filters.rst","components/memories/index.rst","components/memory_backends/index.rst","components/orchestrators/index.rst","components/spaces.rst","contributing/add_agent.rst","contributing/add_env.rst","dashboard.rst","design/control_flow.rst","design/horizontal_scaling.rst","design/network.rst","dist_usage.rst","features/algorithms.rst","features/batch_rl.rst","features/benchmarks.rst","features/environments.rst","features/index.rst","index.rst","selecting_an_algorithm.rst","test.rst","usage.rst"],objects:{"rl_coach.agents.acer_agent":{ACERAlgorithmParameters:[6,0,1,""]},"rl_coach.agents.actor_critic_agent":{ActorCriticAlgorithmParameters:[5,0,1,""]},"rl_coach.agents.agent":{Agent:[3,0,1,""]},"rl_coach.agents.agent.Agent":{act:[3,1,1,""],call_memory:[3,1,1,""],choose_action:[3,1,1,""],collect_savers:[3,1,1,""],create_networks:[3,1,1,""],freeze_memory:[3,1,1,""],get_predictions:[3,1,1,""],get_state_embedding:[3,1,1,""],handle_episode_ended:[3,1,1,""],init_environment_dependent_modules:[3,1,1,""],initialize_session_dependent_components:[3,1,1,""],learn_from_batch:[3,1,1,""],load_memory_from_file:[3,1,1,""],log_to_screen:[3,1,1,""],observe:[3,1,1,""],parent:[3,1,1,""],phase:[3,1,1,""],post_training_commands:[3,1,1,""],prepare_batch_for_inference:[3,1,1,""],register_signal:[3,1,1,""],reset_evaluation_state:[3,1,1,""],reset_internal_state:[3,1,1,""],restore_checkpoint:[3,1,1,""],run_off_policy_evaluation:[3,1,1,""],run_pre_network_filter_for_inference:[3,1,1,""],save_checkpoint:[3,1,1,""],set_environment_parameters:[3,1,1,""],set_incoming_directive:[3,1,1,""],set_session:[3,1,1,""],setup_logger:[3,1,1,""],sync:[3,1,1,""],train:[3,1,1,""],update_log:[3,1,1,""],update_step_in_episode_log:[3,1,1,""],update_transition_before_adding_to_replay_buffer:[3,1,1,""]},"rl_coach.agents.bc_agent":{BCAlgorithmParameters:[1,0,1,""]},"rl_coach.agents.categorical_dqn_agent":{CategoricalDQNAlgorithmParameters:[15,0,1,""]},"rl_coach.agents.cil_agent":{CILAlgorithmParameters:[2,0,1,""]},"rl_coach.agents.clipped_ppo_agent":{ClippedPPOAlgorithmParameters:[7,0,1,""]},"rl_coach.agents.ddpg_agent":{DDPGAlgorithmParameters:[8,0,1,""]},"rl_coach.agents.dfp_agent":{DFPAlgorithmParameters:[4,0,1,""]},"rl_coach.agents.dqn_agent":{DQNAgent:[52,0,1,""],DQNAlgorithmParameters:[17,0,1,""]},"rl_coach.agents.dqn_agent.DQNAgent":{act:[52,1,1,""],call_memory:[52,1,1,""],choose_action:[52,1,1,""],collect_savers:[52,1,1,""],create_networks:[52,1,1,""],freeze_memory:[52,1,1,""],get_predictions:[52,1,1,""],get_state_embedding:[52,1,1,""],handle_episode_ended:[52,1,1,""],improve_reward_model:[52,1,1,""],init_environment_dependent_modules:[52,1,1,""],initialize_session_dependent_components:[52,1,1,""],learn_from_batch:[52,1,1,""],load_memory_from_file:[52,1,1,""],log_to_screen:[52,1,1,""],observe:[52,1,1,""],parent:[52,1,1,""],phase:[52,1,1,""],post_training_commands:[52,1,1,""],prepare_batch_for_inference:[52,1,1,""],register_signal:[52,1,1,""],reset_evaluation_state:[52,1,1,""],reset_internal_state:[52,1,1,""],restore_checkpoint:[52,1,1,""],run_off_policy_evaluation:[52,1,1,""],run_pre_network_filter_for_inference:[52,1,1,""],save_checkpoint:[52,1,1,""],set_environment_parameters:[52,1,1,""],set_incom
ing_directive:[52,1,1,""],set_session:[52,1,1,""],setup_logger:[52,1,1,""],sync:[52,1,1,""],train:[52,1,1,""],update_log:[52,1,1,""],update_step_in_episode_log:[52,1,1,""],update_transition_before_adding_to_replay_buffer:[52,1,1,""]},"rl_coach.agents.mmc_agent":{MixedMonteCarloAlgorithmParameters:[19,0,1,""]},"rl_coach.agents.n_step_q_agent":{NStepQAlgorithmParameters:[20,0,1,""]},"rl_coach.agents.naf_agent":{NAFAlgorithmParameters:[21,0,1,""]},"rl_coach.agents.nec_agent":{NECAlgorithmParameters:[22,0,1,""]},"rl_coach.agents.pal_agent":{PALAlgorithmParameters:[23,0,1,""]},"rl_coach.agents.policy_gradients_agent":{PolicyGradientAlgorithmParameters:[10,0,1,""]},"rl_coach.agents.ppo_agent":{PPOAlgorithmParameters:[11,0,1,""]},"rl_coach.agents.qr_dqn_agent":{QuantileRegressionDQNAlgorithmParameters:[24,0,1,""]},"rl_coach.agents.rainbow_dqn_agent":{RainbowDQNAlgorithmParameters:[25,0,1,""]},"rl_coach.agents.soft_actor_critic_agent":{SoftActorCriticAlgorithmParameters:[12,0,1,""]},"rl_coach.agents.td3_agent":{TD3AlgorithmParameters:[13,0,1,""]},"rl_coach.architectures.architecture":{Architecture:[26,0,1,""]},"rl_coach.architectures.architecture.Architecture":{accumulate_gradients:[26,1,1,""],apply_and_reset_gradients:[26,1,1,""],apply_gradients:[26,1,1,""],collect_savers:[26,1,1,""],construct:[26,1,1,""],get_variable_value:[26,1,1,""],get_weights:[26,1,1,""],parallel_predict:[26,1,1,""],predict:[26,1,1,""],reset_accumulated_gradients:[26,1,1,""],set_variable_value:[26,1,1,""],set_weights:[26,1,1,""],train_on_batch:[26,1,1,""]},"rl_coach.architectures.network_wrapper":{NetworkWrapper:[26,0,1,""]},"rl_coach.architectures.network_wrapper.NetworkWrapper":{apply_gradients_and_sync_networks:[26,1,1,""],apply_gradients_to_global_network:[26,1,1,""],apply_gradients_to_online_network:[26,1,1,""],collect_savers:[26,1,1,""],parallel_prediction:[26,1,1,""],set_is_training:[26,1,1,""],sync:[26,1,1,""],train_and_sync_networks:[26,1,1,""],update_online_network:[26,1,1,""],update_target_network:[26,1,1,""]},"rl_coach.base_parameters":{AgentParameters:[3,0,1,""],DistributedTaskParameters:[0,0,1,""],NetworkParameters:[26,0,1,""],PresetValidationParameters:[0,0,1,""],TaskParameters:[0,0,1,""],VisualizationParameters:[0,0,1,""]},"rl_coach.core_types":{ActionInfo:[27,0,1,""],Batch:[27,0,1,""],EnvResponse:[27,0,1,""],Episode:[27,0,1,""],Transition:[27,0,1,""]},"rl_coach.core_types.Batch":{actions:[27,1,1,""],game_overs:[27,1,1,""],goals:[27,1,1,""],info:[27,1,1,""],info_as_list:[27,1,1,""],n_step_discounted_rewards:[27,1,1,""],next_states:[27,1,1,""],rewards:[27,1,1,""],shuffle:[27,1,1,""],size:[27,1,1,""],slice:[27,1,1,""],states:[27,1,1,""]},"rl_coach.core_types.Episode":{get_first_transition:[27,1,1,""],get_last_transition:[27,1,1,""],get_transition:[27,1,1,""],get_transitions_attribute:[27,1,1,""],insert:[27,1,1,""],is_empty:[27,1,1,""],length:[27,1,1,""],update_discounted_rewards:[27,1,1,""]},"rl_coach.data_stores.nfs_data_store":{NFSDataStore:[28,0,1,""]},"rl_coach.data_stores.s3_data_store":{S3DataStore:[28,0,1,""]},"rl_coach.environments.carla_environment":{CarlaEnvironment:[29,0,1,""]},"rl_coach.environments.control_suite_environment":{ControlSuiteEnvironment:[29,0,1,""]},"rl_coach.environments.doom_environment":{DoomEnvironment:[29,0,1,""]},"rl_coach.environments.environment":{Environment:[29,0,1,""]},"rl_coach.environments.environment.Environment":{action_space:[29,1,1,""],close:[29,1,1,""],get_action_from_user:[29,1,1,""],get_available_keys:[29,1,1,""],get_goal:[29,1,1,""],get_random_action:[29,1,1,""],get
_rendered_image:[29,1,1,""],goal_space:[29,1,1,""],handle_episode_ended:[29,1,1,""],last_env_response:[29,1,1,""],phase:[29,1,1,""],render:[29,1,1,""],reset_internal_state:[29,1,1,""],set_goal:[29,1,1,""],state_space:[29,1,1,""],step:[29,1,1,""]},"rl_coach.environments.gym_environment":{GymEnvironment:[29,0,1,""]},"rl_coach.environments.starcraft2_environment":{StarCraft2Environment:[29,0,1,""]},"rl_coach.exploration_policies.additive_noise":{AdditiveNoise:[30,0,1,""]},"rl_coach.exploration_policies.boltzmann":{Boltzmann:[30,0,1,""]},"rl_coach.exploration_policies.bootstrapped":{Bootstrapped:[30,0,1,""]},"rl_coach.exploration_policies.categorical":{Categorical:[30,0,1,""]},"rl_coach.exploration_policies.continuous_entropy":{ContinuousEntropy:[30,0,1,""]},"rl_coach.exploration_policies.e_greedy":{EGreedy:[30,0,1,""]},"rl_coach.exploration_policies.exploration_policy":{ExplorationPolicy:[30,0,1,""]},"rl_coach.exploration_policies.exploration_policy.ExplorationPolicy":{change_phase:[30,1,1,""],get_action:[30,1,1,""],requires_action_values:[30,1,1,""],reset:[30,1,1,""]},"rl_coach.exploration_policies.greedy":{Greedy:[30,0,1,""]},"rl_coach.exploration_policies.ou_process":{OUProcess:[30,0,1,""]},"rl_coach.exploration_policies.parameter_noise":{ParameterNoise:[30,0,1,""]},"rl_coach.exploration_policies.truncated_normal":{TruncatedNormal:[30,0,1,""]},"rl_coach.exploration_policies.ucb":{UCB:[30,0,1,""]},"rl_coach.filters.action":{AttentionDiscretization:[33,0,1,""],BoxDiscretization:[33,0,1,""],BoxMasking:[33,0,1,""],FullDiscreteActionSpaceMap:[33,0,1,""],LinearBoxToBoxMap:[33,0,1,""],PartialDiscreteActionSpaceMap:[33,0,1,""]},"rl_coach.filters.observation":{ObservationClippingFilter:[32,0,1,""],ObservationCropFilter:[32,0,1,""],ObservationMoveAxisFilter:[32,0,1,""],ObservationNormalizationFilter:[32,0,1,""],ObservationRGBToYFilter:[32,0,1,""],ObservationReductionBySubPartsNameFilter:[32,0,1,""],ObservationRescaleSizeByFactorFilter:[32,0,1,""],ObservationRescaleToSizeFilter:[32,0,1,""],ObservationSqueezeFilter:[32,0,1,""],ObservationStackingFilter:[32,0,1,""],ObservationToUInt8Filter:[32,0,1,""]},"rl_coach.filters.reward":{RewardClippingFilter:[32,0,1,""],RewardNormalizationFilter:[32,0,1,""],RewardRescaleFilter:[32,0,1,""]},"rl_coach.memories.backend.redis":{RedisPubSubBackend:[35,0,1,""]},"rl_coach.memories.episodic":{EpisodicExperienceReplay:[34,0,1,""],EpisodicHRLHindsightExperienceReplay:[34,0,1,""],EpisodicHindsightExperienceReplay:[34,0,1,""],SingleEpisodeBuffer:[34,0,1,""]},"rl_coach.memories.non_episodic":{BalancedExperienceReplay:[34,0,1,""],ExperienceReplay:[34,0,1,""],PrioritizedExperienceReplay:[34,0,1,""],QDND:[34,0,1,""],TransitionCollection:[34,0,1,""]},"rl_coach.orchestrators.kubernetes_orchestrator":{Kubernetes:[36,0,1,""]},"rl_coach.spaces":{ActionSpace:[37,0,1,""],AttentionActionSpace:[37,0,1,""],BoxActionSpace:[37,0,1,""],CompoundActionSpace:[37,0,1,""],DiscreteActionSpace:[37,0,1,""],GoalsSpace:[37,0,1,""],ImageObservationSpace:[37,0,1,""],MultiSelectActionSpace:[37,0,1,""],ObservationSpace:[37,0,1,""],PlanarMapsObservationSpace:[37,0,1,""],Space:[37,0,1,""],VectorObservationSpace:[37,0,1,""]},"rl_coach.spaces.ActionSpace":{clip_action_to_space:[37,1,1,""],contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""],sample_with_info:[37,1,1,""]},"rl_coach.spaces.GoalsSpace":{DistanceMetric:[37,0,1,""],clip_action_to_space:[37,1,1,""],contains:[37,1,1,""],distance_from_goal:[37,1,1,""],get_reward_for_goal_and_state:[37,1,1,""],goal_from_state:[37,1,1,""],is_valid_index:[
37,1,1,""],sample:[37,1,1,""],sample_with_info:[37,1,1,""]},"rl_coach.spaces.ObservationSpace":{contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""]},"rl_coach.spaces.Space":{contains:[37,1,1,""],is_valid_index:[37,1,1,""],sample:[37,1,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"]},objtypes:{"0":"py:class","1":"py:method"},terms:{"100x100":33,"160x160":32,"1_0":[15,25],"1st":30,"20x20":33,"210x160":32,"2nd":30,"50k":41,"9_amd64":44,"abstract":[38,42],"boolean":[3,27,37,52],"break":40,"case":[0,3,5,22,26,27,30,37,46,51,52,53],"class":[0,1,2,3,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,39,41,45,52],"default":[0,30,53],"enum":[26,29,37],"export":[0,26,44],"final":[8,13,16,17,19,23,41],"float":[3,4,5,6,7,8,10,11,12,13,15,19,22,23,24,26,27,29,30,32,33,34,37,38,52],"function":[0,1,3,6,7,8,11,13,26,29,30,37,38,39,41,43,52],"import":[6,18,30,34,39,51,53],"int":[0,3,4,5,6,7,10,15,20,22,24,25,27,29,30,32,33,34,37,52],"long":43,"new":[0,3,7,8,11,12,13,22,23,26,27,33,41,42,46,50,51,52],"return":[0,3,8,10,11,13,14,19,22,23,25,26,27,29,30,32,34,37,38,39,41,51,52],"short":[0,41],"static":26,"super":[38,39],"switch":[0,40],"true":[0,3,4,5,6,7,8,11,12,13,22,23,25,26,27,29,30,33,34,37,52],"try":[4,47,51],"while":[0,5,6,8,9,10,11,12,13,26,29,40,43,51,53],AWS:44,Adding:[18,50],And:[39,51],But:[40,51],Doing:51,For:[0,1,2,3,4,7,10,14,15,16,17,20,22,23,26,27,29,30,31,32,33,37,38,39,41,42,43,44,47,52,53],Has:26,Its:52,NFS:[28,44],One:[24,51,53],That:40,The:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,40,41,42,43,44,47,48,50,51,52,53],Then:[4,7,8,13,14,21,23],There:[7,11,26,30,31,38,39,43,46,53],These:[1,2,3,24,29,36,42,43,44],Use:[1,2,8,13,21,22],Used:30,Uses:51,Using:[8,13,14,16,17,44],Will:26,With:[30,50],__init__:[29,38,39],_index:[5,20],_nois:13,_render:39,_restart_environment_episod:39,_take_act:39,_update_st:39,a2c:51,a3c:[10,20,40,51],a_i:22,a_t:[4,5,6,8,12,13,14,15,16,17,19,20,21,23,25],a_valu:5,abl:[33,51],about:[3,27,41,52,53],abov:[8,12,13,26,41],abs:[20,34],absolut:30,acceler:21,accept:29,access:[26,38,44],accord:[0,3,4,5,6,8,12,13,14,20,26,27,30,37,40,41,43,52],accordingli:[22,37,41,53],account:[4,7,11,22,23,30],accumul:[3,4,5,6,10,20,22,25,26,32,51,52],accumulate_gradi:26,accumulated_gradi:26,accur:51,acer:[3,51],acer_ag:6,aceralgorithmparamet:6,achiev:[0,4,7,29,32,34,37,47,51,53],acquir:12,across:[10,19,40],act:[3,4,8,13,14,24,37,38,41,52],action:[1,2,3,15,16,17,18,19,20,23,24,25,26,27,29,30,31,34,38,39,41,43,52],action_idx:39,action_penalti:[8,13],action_spac:[29,30],action_space_s:26,action_valu:[27,30],actioninfo:[3,37,41,52],actionspac:[30,37],actiontyp:39,activ:[8,13,26],actor:[3,6,7,8,11,13,30,43,51],actor_critic_ag:5,actorcriticag:38,actorcriticalgorithmparamet:5,actual:[4,5,15,16,17,24,25,30,33,34],adam:[7,26],adam_optimizer_beta1:26,adam_optimizer_beta2:26,adapt:[7,11],add:[8,9,13,21,26,27,30,32,39,41,44,51],add_rendered_image_to_env_respons:0,added:[0,4,6,7,10,11,22,30,34,38],adding:[3,11,30,38,52],addit:[3,26,27,29,30,32,34,37,39,40,41,43,50,51,52],addition:[26,29,32,38,39,41,47,48,53],additional_fetch:26,additional_input:26,additional_simulator_paramet:[29,39],additionali:40,additive_nois:30,additivenoiseparamet:30,address:13,advanc:[25,50],advantag:[3,5,7,11,18,30],affect:[0,14,26],aforement:[16,17,23],after:[0,3,8,10,11,12,20,21,23,25,26,27,29,32,37,52,53],again:30,against:3,agent:[0,1,2,4,5,6,7,8,10,11,12,13,15,17,19,20,21,22,23,24,25,26,27
[Regenerated Sphinx search index: the minified Search.setIndex term, docname, object, and title listings are rebuilt, adding entries for the new Wolpertinger agent (components/agents/policy_optimization/wolpertinger, rl_coach.agents.wolpertinger_agent, WolpertingerAlgorithmParameters) alongside the existing agent, architecture, filter, memory, and environment pages.]
crd:54,creat:[3,21,27,33,40,53,54],create_network:[3,53],create_target_network:27,creation:[3,53],credenti:45,critic:[3,6,7,8,11,13,14,31,44,52],crop:[33,34],crop_high:33,crop_low:33,cross:[1,16,26],csv:0,ctrl:41,cuda:45,cudnn7:45,curl:45,curr_stat:[3,39,53],current:[0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,17,18,20,22,23,24,25,27,28,30,31,33,34,38,39,42,51,52,53],custom:[30,31,38,39,42],custom_reward_threshold:30,cycl:42,dai:54,dashboard:[0,3,45,51,53],data:[0,3,10,21,27,35,42,43,45,47,48,51,52,53,54],data_stor:[29,45],dataset:[3,7,11,47,52,53,54],date:[23,44,52,54],dcp:[45,54],ddpg:[14,52],ddpg_agent:8,ddpgalgorithmparamet:8,ddqn:[20,24,52],deal:52,debug:[0,41,51],decai:[5,7,11,27],decid:[0,3,4,30,31,39,53],decis:[3,53],declar:0,decod:45,dedic:27,deep:[0,3,5,12,14,15,17,19,21,22,26,53],deepmind:49,def:[39,40],default_act:38,default_input_filt:40,default_output_filt:40,defin:[0,3,5,6,7,10,11,12,21,23,24,27,28,30,31,33,34,35,38,39,40,42,43,44,48,49,53,54],definit:[3,27,30,38,40,42,53],delai:[3,52],delta:[6,16,23,26],demonstr:[1,2,54],dens:31,densiti:20,depecr:0,depend:[0,3,6,27,33,35,38,40,45,48,52,53],deploi:[37,43,47],depth:30,descend:52,describ:[3,16,25,33,35,39,42,45,53],descript:[3,34,38,46,54],design:[42,45,51],desir:[34,39],destabil:10,detail:[3,28,46,47,49,51,54],determin:[2,3,23,28,35,53],determinist:[3,12,52],dev:45,develop:[42,48],deviat:[10,11,31,33,41],devic:27,dfp:52,dfp_agent:4,dfpalgorithmparamet:4,dict:[3,4,27,28,30,31,38,53],dict_siz:35,dictat:4,dictionari:[2,3,27,28,30,35,38,39,53],did:30,differ:[0,1,2,3,4,5,6,7,10,11,15,19,27,30,31,33,38,39,40,41,43,44,51,52,53],differenti:19,difficult:[41,48],difficulti:54,dimens:[28,30,33,34],dimension:[11,34],dir:[0,3,53,54],direct:[3,30,53],directli:[3,5,14,42,44,53],directori:[0,27,39,41,45,54],disabl:54,disable_fog:30,disappear:30,disassembl:52,discard:[28,33],discount:[8,10,11,13,20,23,24,26,27,28,52],discret:[1,2,4,7,11,14,15,16,17,18,19,20,21,23,24,25,26,31,32,33,34,38,42],disentangl:42,disk:0,displai:[0,41],distanc:38,distance_from_go:38,distance_metr:38,distancemetr:38,distil:[3,53],distribut:[5,6,10,11,12,16,25,26,27,29,31,36,37,38,44,51,52,54],distributed_coach:43,distributed_coach_synchronization_typ:43,distributedcoachsynchronizationtyp:43,divereg:[7,11],diverg:[6,7,11,26],dnd:[0,23,52],dnd_key_error_threshold:23,dnd_size:23,do_action_hindsight:35,doc:45,docker:45,dockerfil:45,document:49,doe:[15,27,33],doesn:43,doing:[7,11,32],domain:44,don:[4,31,41,52],done:[0,3,7,10,11,13,30,33,40,53,54],doom:[30,40,45,49],doom_basic_bc:54,doom_basic_dqn:54,doom_environ:[30,40,54],doomenviron:[30,40],doomenvironmentparamet:[40,54],doominputfilt:40,doomlevel:30,doomoutputfilt:40,doubl:[3,20,26],doubli:53,down:[27,30,52],download:45,dpkg:45,dqn:[3,20,21,26,30,31,33,34,42,44,52],dqn_agent:[18,53],dqnagent:53,dqnalgorithmparamet:18,drive:[2,30,49,51],driving_benchmark:30,due:33,duel:[3,26],dump:[0,3,53],dump_csv:0,dump_gif:0,dump_in_episode_sign:0,dump_mp4:0,dump_one_value_per_episod:[3,53],dump_one_value_per_step:[3,53],dump_parameters_document:0,dump_signals_to_csv_every_x_episod:0,dure:[3,6,7,10,11,12,15,23,31,41,42,53,54],dynam:[41,48,52],e_greedi:31,each:[0,1,2,3,4,5,6,7,10,11,12,15,17,18,19,21,23,24,25,27,28,30,31,32,33,34,35,38,39,41,42,43,44,45,48,52,53],eas:41,easi:[40,41,51],easier:44,easili:[31,47,54],echo:45,effect:[0,3,6,7,21,33,42,53],effici:[6,42,52],either:[0,3,5,21,27,31,38,41,44,54],element:[3,15,27,33,38],elf:45,embbed:27,embed:[3,23,27,53],embedd:[27,44],embedding_merger_typ:27,embeddingmergertyp:27,emploi:52,empti:28,emul:6,
enabl:[27,44,54],encod:[33,38],encourag:[22,24,42],end:[2,3,10,26,28,30,33,53,54],enforc:34,engin:[30,49],enough:[4,6,23],ensembl:[31,52],ensur:[6,27],enter:[3,53,54],entir:[11,20,23,26,31,34,42],entri:[23,42],entropi:[1,5,6,7,10,11,12,16,26,31,52],enumer:38,env:[28,45],env_param:40,env_respons:[3,53],enviorn:30,environ:[0,3,4,6,19,27,28,31,32,33,34,38,39,42,45,48,50,51,53],environmentparamet:[30,40],envrespons:[0,3,30,53],episod:[0,3,4,5,10,11,15,20,21,26,30,31,39,40,41,42,43,53,54],episode_max_tim:30,episodic_hindsight_experience_replai:35,epoch:[7,53],epsilon:[7,31,35],epsilon_schedul:31,equal:2,equat:[8,12,13,14,17,18,21,25],error:[13,27,52],escap:54,especi:19,essenti:[21,27,34,40,42,45],estim:[3,5,7,11,15,20,24,31,53],estimate_state_value_using_ga:[5,7,11],eta:[7,11],etc:[0,3,27,30,32,38,39,49,53],evalu:[0,3,12,14,27,28,31,42,47,53],evaluate_onli:0,evaluation_epsilon:31,evaluation_nois:31,even:[19,27,30,40,41,42,47,52],eventu:14,everi:[0,5,6,8,10,12,13,14,15,16,17,18,20,21,22,24,25,26,54],exact:[23,31,48],exactli:[14,27],exampl:[2,3,4,27,28,30,31,32,33,34,38,39,40,42,44,47,53,54],except:[21,28],execut:[28,41,42],exercis:13,exhibit:[3,39,53],exist:[23,27],exit:[3,53],expand_dim:28,expect:[0,3,31,48,53],experi:[0,6,8,11,12,13,14,26,30,35,36,41,42,43,45,47,51,52,54],experiment_path:[0,30],experiment_suit:30,experimentsuit:30,expert:[1,2,28,52],exploit:[31,42],explor:[3,4,5,6,7,8,9,11,13,14,15,20,22,23,39,42,47,51,52],exploration_polici:31,explorationparamet:[3,31,39],exponenti:[6,7,11,26,27],expor:3,export_onnx_graph:0,expos:[41,44,51],extend:[30,31,49],extens:[30,49],extent:54,extern:0,extra:[3,27,28,44,53],extract:[3,22,23,28,33,38,41,42,53],factor:[8,10,11,13,24,26,27,28,31,33],failur:0,faithfulli:41,fake:38,fals:[0,3,8,13,27,28,30,31,34,35,38,40,53],far:[11,33,42,48],faster:[19,52],featur:[8,13,30,44,51,52],feature_minimap_maps_to_us:30,feature_screen_maps_to_us:30,fetch:[27,28],fetched_tensor:27,few:[10,15,16,17,18,20,24,25,26,31,40],field:[48,51],file:[0,3,39,42,53,54],fill:[28,40],filter:[0,3,51,53],filtered_action_spac:38,find:[14,17,41,49,51],finish:[23,54],finit:34,first:[0,8,11,13,14,15,23,25,26,27,28,33,42,44],fit:[13,38],fix:47,flag:[0,3,27,28,30,53],flexibl:43,flicker:30,flow:[14,32,51],follow:[2,3,5,6,8,10,12,13,14,16,17,18,21,22,23,25,26,27,28,30,31,35,39,40,45,48,52,53],footprint:33,forc:[27,30,34,40],force_cpu:27,force_environment_reset:[30,40],force_int_bin:34,forced_attention_s:38,form:[4,21,38,52],format:39,formul:[5,6],forward:[27,31],found:[3,46,47,54],frac:[6,7,12,16,26],fraction:[7,11],frame:[0,30],frame_skip:30,framework:[0,3,27,39,51,53],framework_typ:0,free:[30,49],freeglut3:45,freez:[3,53],freeze_memori:[3,53],frequenc:13,from:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,30,31,32,33,34,36,38,39,40,41,42,43,44,45,47,48,49,51,53,54],full:[3,10,20,34,53],fulldiscreteactionspacemap:34,fulli:27,func:[3,53],further:47,futur:[0,3,10,28,52],future_measurements_weight:4,gae:[5,7,11],gae_lambda:[5,7,11],game:[3,28,30,49,51,53,54],game_ov:28,gamma:[5,6,8,12,13,14,15,16,17,18,20,21,22,23,24,26],gap:[24,52],gather:43,gaussian:[11,12,13,14,31],gener:[0,5,7,11,15,27,30,31,35,38,39,45,54],general_network:39,get:[3,4,7,8,9,10,11,13,14,15,17,18,20,22,24,27,28,30,31,38,42,44,45,48,53],get_act:31,get_action_from_us:30,get_available_kei:30,get_first_transit:28,get_goal:30,get_last_env_respons:30,get_last_transit:28,get_output_head:39,get_predict:[3,53],get_random_act:30,get_rendered_imag:[30,40],get_reward_for_goal_and_st:38,get_state_embed:[3,53],ge
t_transit:28,get_transitions_attribut:28,get_variable_valu:27,get_weight:27,gfortran:45,gif:0,git:45,github:[40,45,48,51],given:[0,1,2,3,4,5,8,10,11,13,14,27,28,30,31,33,34,35,38,39,42,47,53],given_weight:27,global:[3,27,44,53],global_network:27,glx:45,goal:[1,2,3,4,6,27,28,30,35,42,44,52,53],goal_from_st:38,goal_nam:38,goal_spac:30,goal_vector:4,goals_spac:35,goalsspac:[35,38],goaltorewardconvers:38,going:32,good:[14,40,41],gpu:[0,27],gracefulli:54,gradient:[3,5,6,7,11,14,21,23,27,39,52,53],gradientclippingmethod:27,gradients_clipping_method:27,granular:35,graph:0,graphmanag:42,grayscal:[33,38],greedili:42,group:41,grow:26,guidelin:52,gym:[45,49],gym_environ:[30,54],gymenviron:30,gymenvironmentparamet:40,hac:52,had:48,hand:[19,33,42,52],handl:4,handle_episode_end:[3,30,53],handling_targets_after_episode_end:4,handlingtargetsafterepisodeend:4,hard:[41,52],harder:[41,47],has:[0,3,19,23,24,28,31,33,42,44,48,52,53],has_glob:27,has_target:27,hat:[6,7,16,26],have:[0,3,4,6,27,30,31,33,34,35,42,44,47,48,53],head:[1,2,3,5,6,10,15,19,22,23,27,31,39,44,53],headparamet:27,heads_paramet:27,health_gath:30,heat:6,heatup:[31,42],help:[24,28,41,42,52],here:[40,42],heurist:[11,31],hide:44,hierarch:[38,42],hierarchi:[3,42,52,53],high:[8,11,13,33,34,38,41],high_i:38,high_kl_penalty_coeffici:11,high_x:38,higher:11,highest:[5,6,10,14,24,31,33,34,38],highli:[0,40,52],hindsight:[9,35,52],hindsight_goal_selection_method:35,hindsight_transitions_per_regular_transit:35,hindsightgoalselectionmethod:35,hold:[15,27,28,35,41,42,44],horizont:[45,51,54],host:45,hostnam:0,hot:38,how:[4,7,11,31,43,45,52,54],hrl:35,html:45,http:[21,35,45],hub:45,huber:25,huber_loss_interv:25,human:[0,30],human_control:30,hyper:[39,48],hyperparamet:39,ident:27,identifi:[27,38],ies:53,ignor:30,imag:[0,27,30,33,34,38,40,44,54],image1:45,imit:[3,28,46,52],impact:27,implement:[3,7,11,27,29,30,31,35,39,40,43,48,52,54],impli:54,implment:37,importance_weight:27,importance_weight_trunc:6,importantli:42,improv:[5,19,26,30,42,52],improve_reward_model:53,includ:[0,3,4,30,32,33,37,44,49,53,54],incorpor:27,increas:[11,24,33,52],increment:[3,53],index:[0,2,14,28,30,33,34,35,38],indic:38,inf:[33,38],infer:[3,27,30,47,53],infinit:[0,52],info:[3,15,28,38,40,53],info_as_list:28,inform:[3,4,21,28,30,32,41,42,45,49,53],inherit:[3,39,40],init_environment_dependent_modul:[3,53],initi:[3,4,11,24,27,28,39,42,51,53],initial_feed_dict:27,initial_kl_coeffici:11,initialize_session_dependent_compon:[3,53],innov:52,input:[1,2,3,4,8,13,14,15,17,18,20,22,23,24,27,32,38,42,44,53],input_embedders_paramet:27,input_high:33,input_low:33,input_space_high:34,input_space_low:34,inputembedderparamet:27,inputfilt:42,insert:[23,28],inspect:0,instal:[45,54],instanc:[3,36,38,44],instanti:[3,30,42],instead:[0,3,7,21,24,27,33,34,42,52,53],instruct:54,intact:[15,48],integ:[0,33,34],integr:[40,42,43,47,51],intel:51,intend:[10,27,31,42],interact:[28,42,43,47,51,54],interest:[27,41],interfac:[30,41,43,49],intermedi:23,intern:[3,10,21,27,28,32,42,53,54],intersect:52,interv:25,intro:51,introduc:52,invers:[30,49],invok:42,involv:39,is_empti:28,is_valid_index:38,item:28,iter:[3,5,6,8,11,13,19,27,53],its:[0,3,16,26,27,28,31,38,42,45,52,53,54],itself:[27,38,54],job:0,job_typ:0,joint:30,json:0,jump:[4,34],jupyt:39,just:[3,11,24,26,40,42,44,47,53,54],kapa:25,keep:[18,28,33,54],kei:[2,23,27,28,30,35,39,41,45,52,54],key_error_threshold:35,key_width:35,keyboard:[30,54],keyword:27,kl_coeffici:27,kl_coefficient_ph:27,know:[3,52,53,54],knowledg:[3,42,53],known:[28,41,48,52],kubeconfig:37,kubernet:45,kubern
etes_orchestr:37,kubernetesparamet:37,kwarg:[27,30],l2_norm_added_delta:23,l2_regular:27,lack:41,lamb:31,lambda:[5,7,11,31],lane:2,larg:[14,31,33,49],larger:27,last:[4,6,11,23,28,30,33],last_env_respons:30,lastli:42,later:[0,3,27,53,54],latest:[21,23,42,45],layer:[27,31,35,42,44],lazi:[28,33],lazystack:33,lbfg:27,ld_library_path:45,lead:31,learn:[0,3,4,5,6,8,9,10,12,14,15,16,17,18,19,22,25,26,27,28,30,31,33,41,42,44,46,48,49,50,52,53],learn_from_batch:[3,39,42,53],learner:27,learning_r:[27,35],learning_rate_decay_r:27,learning_rate_decay_step:27,least:[44,52],leav:[11,15],left:[2,6,12,52],length:[4,5,7,11,21,23,27,28],less:[19,52],level:[0,3,27,30,40,53,54],levelmanag:[3,42,53],levelselect:30,libatla:45,libav:45,libavformat:45,libbla:45,libboost:45,libbz2:45,libfluidsynth:45,libgl1:45,libglew:45,libgm:45,libgstream:45,libgtk2:45,libgtk:45,libjpeg:45,liblapack:45,libnotifi:45,libopen:45,libosmesa6:45,libportmidi:45,librari:[30,45,49],libsdl1:45,libsdl2:45,libsdl:45,libsm:45,libsmpeg:45,libswscal:45,libtiff:45,libwebkitgtk:45,libwildmidi:45,like:[12,30,38,42,44,45,47,52],likelihood:[7,11],line:[3,42,53,54],linear:34,linearboxtoboxmap:34,linearli:34,list:[0,3,4,27,28,30,31,33,34,38,39,53,54],load:[0,3,41,43,53,54],load_memory_from_fil:[3,53],load_memory_from_file_path:54,local:[3,44,45,53],locat:[25,28,33,52],log:[0,3,5,6,10,12,53],log_to_screen:[3,53],logger:[0,3,53],look:[40,45],loop:42,loss:[1,2,3,6,7,10,11,16,17,18,25,26,27,31,39,44,53],lot:[31,41,47,48,52],low:[8,11,13,33,34,38],low_i:38,low_x:38,lower:[0,35,42],lowest:[33,34,38],lstm:44,lumin:33,lvert:[6,16,26],lvl:54,mai:[0,27,46,54],main:[3,39,42,44,46,53,54],mainli:43,major:31,make:[0,3,27,30,39,41,45,47,48,52,53],manag:[3,27,43,45,53],mandatori:[38,40,44],mani:[3,19,46,48],manner:[11,20,21,24,33,42],manual:45,map:[3,27,30,32,33,34,38,39,53],mark:28,markdown:53,mask:[15,34],masked_target_space_high:34,masked_target_space_low:34,master:[3,42,45,53],match:[2,23,27,38],mathbb:[5,6],mathcal:13,mathop:5,max:[5,6,13,16,21,26,33],max_a:[15,18,23,24],max_action_valu:28,max_episodes_to_achieve_reward:0,max_fps_for_human_control:0,max_kl_diverg:6,max_over_num_fram:30,max_simultaneous_selected_act:38,max_siz:35,max_spe:30,maxim:[4,17],maximum:[0,12,16,18,23,24,28,30,31,33,35,52],mdp:47,mean:[0,2,7,8,9,10,11,12,13,14,22,27,31,33,34,38,41,47,52],meant:44,measur:[3,4,27,30,33,38,40,52,53],measurements_nam:38,mechan:[32,43,48,54],memor:52,memori:[3,26,28,33,39,42,43,45,51,52,53],memory_backend:45,memorygranular:35,memoryparamet:[3,39],merg:[27,30],mesa:45,method:[0,5,7,11,13,21,27,33,35],metric:[0,38,41],mid:6,middlewar:[23,27,44],middleware_paramet:27,middlewareparamet:27,midpoint:25,might:[3,10,30,39,44,53],min:[6,7,13,16,24,26],min_:[12,13],min_reward_threshold:0,mind:54,minim:[2,4,16],minimap_s:30,minimum:[0,7,13,33],mitig:52,mix:[3,7,11,23,24,52],mixedmontecarloalgorithmparamet:20,mixer1:45,mixtur:[20,27],mjkei:45,mjpro150:45,mjpro150_linux:45,mkdir:45,mmc:[20,52],mmc_agent:20,mode:[24,27,29,36,37,42,43,45,54],model:[0,20,22,27,51,53,54],modif:52,modifi:6,modul:[3,39,42,43,53],modular:[39,42,44,51],monitor:43,mont:[3,24],monte_carlo_mixing_r:[20,24],more:[3,8,13,21,27,33,39,41,42,44,45,47,51,53,54],moreov:41,most:[3,10,23,27,28,31,44,48,52,53,54],mostli:[33,42],motiv:42,move:[6,7,11,33,41,48],mp4:0,mse:[2,6,17,18,25],much:[7,11,42,52],mujoco:[30,34,40,45,49],mujoco_kei:45,mujoco_pi:45,multi:[11,27,38,44],multiarrai:[3,53],multidimension:38,multipl:[4,7,11,21,27,30,31,33,34,35,38,41,42,48,51,54],multipli:[4,10,27,33],multiselect:34,multitask:[30
,49],must:[27,33,38,48],mxnet:54,n_step:[23,26,28,35],n_step_discounted_reward:28,n_step_q_ag:21,nabla:[6,8,13,14],nabla_:[8,12,13,14],nabla_a:[8,13,14],naf:52,naf_ag:22,nafalgorithmparamet:22,name:[3,27,28,30,33,38,39,45,53,54],namespac:37,nasm:45,nativ:[0,30,40,49],native_rend:0,navig:3,ndarrai:[3,27,28,30,31,33,34,38,40,53],nearest:23,neat:41,nec:[0,52],nec_ag:23,necalgorithmparamet:23,necessari:[3,23,27,53],necessarili:33,need:[0,3,6,26,27,30,31,38,39,42,48,52,53,54],neg:[4,33],neighbor:23,neighborhood:14,neon_compon:39,nervanasystem:45,network:[0,3,27,31,39,42,48,51,52,53,54],network_input_tupl:27,network_nam:[3,53],network_param:31,network_paramet:27,network_wrapp:[3,27,53],networkparamet:[3,27,31,39],networkwrapp:[3,53],neural:[3,20,27,44,48],never:27,new_value_shift_coeffici:[23,35],new_weight:27,newli:[24,40,47,52],next:[0,3,8,13,14,17,18,22,24,25,28,30,42,53,54],next_stat:28,nfs_data_stor:29,nfsdatastoreparamet:29,nice:54,no_accumul:27,node:[27,44],nois:[8,9,13,14,22,31,42,52],noise_as_percentage_from_action_spac:31,noise_schedul:31,noisi:[10,26,31],non_episod:35,none:[0,3,7,8,11,13,27,28,30,31,33,34,38,40,53],nor:[],norm:27,norm_unclipped_grad:27,norm_unclippsed_grad:27,normal:[3,4,10,31,32,33,38],note:[23,27,31,53],notebook:39,notic:[27,52],notori:[41,48,52],now:[7,40],nstepqalgorithmparamet:21,nth:26,num_act:[23,35,38],num_bins_per_dimens:34,num_class:35,num_consecutive_playing_step:[3,8,13,53],num_consecutive_training_step:[3,53],num_gpu:0,num_neighbor:35,num_predicted_steps_ahead:4,num_speedup_step:30,num_steps_between_copying_online_weights_to_target:[8,12,13,21],num_steps_between_gradient_upd:[5,6,10,21],num_task:0,num_training_task:0,num_transitions_to_start_replai:6,num_work:0,number:[0,2,4,5,6,8,10,12,13,15,16,21,23,25,26,27,28,30,31,33,34,35,41,49,53,54],number_of_knn:23,numpi:[3,27,28,30,31,33,34,38,40,53],nvidia:45,object:[0,3,26,27,30,31,33,35,42,53],observ:[0,3,4,11,27,28,30,32,40,42,53],observation_reduction_by_sub_parts_name_filt:33,observation_space_s:27,observation_space_typ:30,observation_stat:33,observation_typ:30,observationspac:38,observationspacetyp:30,observationtyp:30,off:[3,6,12,43,47,52,53],offer:[30,49],often:[41,42,44,47],old:[7,11,27,52],old_weight:27,onc:[0,7,10,11,15,16,17,18,20,21,24,25,26,27,38,54],one:[0,3,6,19,23,24,27,28,30,31,32,35,38,40,41,44,47,52,53],ones:[40,52],onli:[0,3,4,5,6,7,10,11,15,16,18,19,21,23,25,26,27,28,30,31,33,34,40,42,52,53,54],onlin:[8,12,13,14,15,16,17,18,20,21,22,23,24,25,26,27,42,44,47],online_network:27,onnx:[0,27],onto:32,open:[0,30,49],openai:[45,49],opencv:45,oper:[24,27,33],ops:27,optim:[3,4,6,27,46,52],optimization_epoch:7,optimizer_epsilon:27,optimizer_typ:27,option:[6,10,27,30,34,38,39,41,43,44,54],orchestr:[43,45,51],order:[0,3,5,6,7,8,10,11,12,13,14,17,18,19,21,22,23,24,25,27,28,32,33,34,41,42,44,47,48,52,53],org:[21,35],origin:[21,33,34,48],ornstein:[8,9,31],other:[0,2,10,19,24,27,30,32,33,35,41,42,52],otherwis:[11,15,27,30,31,38],ou_process:31,our:7,out:[2,17,18,31,32,34,41,45,51,52,54],outcom:[31,42],output:[0,4,6,8,13,14,15,16,22,23,27,31,32,33,38,39,44],output_0_0:27,output_observation_spac:33,outputfilt:42,outsid:[4,31],over:[3,7,10,11,21,23,26,27,28,31,33,34,41,42,52,53],overestim:[8,13,52],overfit:11,overhead:0,overlai:41,overrid:[3,53],override_existing_kei:35,overriden:39,overview:42,overwhelm:42,overwritten:27,own:[27,39],p_j:[16,26],page:[3,48],pair:[0,38],pal:[24,52],pal_ag:24,pal_alpha:24,palalgorithmparamet:24,paper:[5,10,12,16,21,23,25,30,35,48],parallel:[6,27,41,44],parallel_predict:27,param:[
3,27,28,29,30,31,36,37,39,40,53],paramet:[2,3,4,5,6,7,8,10,11,12,13,16,20,21,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,39,40,48,51,53,54],parameter_nois:31,parameters_server_host:0,parent:[3,27,53],parent_path_suffix:[3,27,53],parmet:3,pars:42,part:[0,3,15,27,28,31,33,34,43,44,48,52,53],part_nam:33,partial:34,partialdiscreteactionspacemap:34,particular:4,particularli:[30,31,38,48,52],pass:[0,4,8,9,13,14,22,23,27,30,31,32,40,41,42,44,54],patamet:23,patchelf:45,patchelf_0:45,path:[0,3,27,39,40,45,53,54],pattern:42,pdf:35,penal:[7,8,11,13],penalti:11,pendulum_hac:40,pendulum_with_go:40,pendulumwithgo:40,per:[0,3,4,38,39,42,53],percentag:31,percentil:31,perceptron:44,perform:[0,3,6,27,28,33,35,40,41,42,52,53],period:[44,54],persist:3,persistent_advantage_learn:24,perspect:16,phase:[3,6,7,8,9,11,12,13,14,27,30,31,42,53],phi:[16,26],physic:[30,49],pi_:[6,7,12],pick:[12,30],pickl:54,pickledreplaybuff:54,pip3:45,pip:45,pixel:30,place:[34,41,42],placehold:[27,31],plai:[0,3,10,15,17,18,21,31,39,41,53],plain:44,planarmap:30,planarmapsobservationspac:33,platform:[30,49],pleas:[21,48],plu:27,plugin:45,point:[33,38,42,43],polici:[1,3,4,5,6,9,12,14,15,21,22,23,29,39,42,43,44,45,46,47,51,52,53],policy_gradient_rescal:[5,7,10,11],policy_gradients_ag:10,policygradientalgorithmparamet:10,policygradientrescal:[5,7,10,11],policyoptimizationag:39,popul:42,popular:[30,49],port:0,posit:[4,33],possibl:[2,3,4,23,31,34,38,41,44,51,52,53,54],post:[32,51],post_training_command:[3,53],power:[30,49],ppo:[7,11,52],ppo_ag:11,ppoalgorithmparamet:11,pre:[8,13,31,32],predefin:[15,24,31,54],predict:[1,2,3,5,6,7,8,11,12,13,15,16,17,18,24,25,26,27,31,44,52,53],prediction_typ:[3,53],predictiontyp:[3,53],prefect:52,prefer:27,prefix:[3,53],prep:45,prepar:[3,53],prepare_batch_for_infer:[3,53],present:[19,23,27,30,33,52],preset:[0,5,39,40,42,43,45,54],press:[41,54],prevent:[8,11,13,42],previou:33,previous:[11,27],print:[0,3,54],print_networks_summari:0,priorit:[26,35],prioriti:[26,35],privat:38,probabilit:[5,6],probabl:[3,5,6,10,15,16,26,28,31,39,52,53],problem:52,procedur:6,process:[0,3,8,9,27,31,32,33,34,39,41,42,44,47,48,51,53],produc:27,progress:27,project:[16,26],propag:7,propagate_updates_to_dnd:23,properti:[3,27,28,30,35,39,40,45,53],proport:35,proto:14,provid:[27,43],proxi:42,proxim:3,pub:[36,37,45],publish:48,purpos:[0,3,10],pursuit:2,push:[3,53],pybullet:[30,49],pygam:[0,45],pytest:45,python3:45,python:[30,35,39,45,49,51],q_i:12,qr_dqn_agent:25,quad:6,qualiti:30,quantil:[3,52],quantileregressiondqnalgorithmparamet:25,queri:[23,27,42,52],question:52,quit:[41,47],r_i:[5,21],r_t:[4,6,7,26],rainbow:[3,39,52],rainbow_ag:39,rainbow_dqn_ag:26,rainbowag:39,rainbowagentparamet:39,rainbowalgorithmparamet:39,rainbowdqnalgorithmparamet:26,rainbowexplorationparamet:39,rainbowmemoryparamet:39,rainbownetworkparamet:39,rais:[3,28,53],ramp:[39,42],random:[0,21,30,31,38,42,48],random_initialization_step:30,randomli:[28,42],rang:[4,7,8,11,13,16,26,30,33,34,38,52],rare:23,rate:[0,6,20,23,27,30,44],rate_for_copying_weights_to_target:[6,8,12,13],rather:[4,12,41],ratio:[6,7,11,20,33],ratio_of_replai:6,raw:[30,49],reach:[0,11,38],read:[0,29],read_csv_tri:0,readabl:42,readm:45,real:3,reason:[33,48],rebuild_on_every_upd:35,receiv:[27,28],recent:[3,26,27,52,53],recommend:40,redi:[36,37,45],redispubsub:45,redispubsubmemorybackendparamet:36,reduc:[1,2,10,11,24,27,33,42,52],reduct:33,reduction_method:33,reductionmethod:33,redund:33,refer:[2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,43,45],referenc:3,regard:[3,53],region:[6,52],re
gist:[3,53],register_sign:[3,53],registri:45,regress:[2,3,52],regula:[6,7,11],regular:[5,7,10,11,21,23,27,31,34,35,52],regularli:27,reinforc:[3,5,8,9,10,12,14,16,17,18,19,21,24,25,26,30,31,41,42,44,46,48,49,50,52],relat:[27,45],relationship:52,releas:[0,51,52],relev:[3,15,31,33,53],remov:[0,33],render:[0,3,30,40],reorder:33,repeat:[30,42],replac:[31,33,35,45],replace_mse_with_huber_loss:27,replai:[1,2,3,6,8,12,13,14,15,16,17,18,21,23,24,25,26,35,42,52,53,54],replay_buff:54,replicated_devic:27,repo:40,repositori:51,repres:[0,7,11,16,26,27,28,30,31,34,38,54],represent:44,reproduc:[42,48],request:[3,27,53],requir:[3,27,29,31,33,41,44,45,52,53],requires_action_valu:31,rescal:[4,5,7,10,11,27,32,33],rescale_factor:33,research:[30,48,49],reset:[3,23,27,30,31,40,53],reset_accumulated_gradi:27,reset_evaluation_st:[3,53],reset_gradi:27,reset_internal_st:[3,30,53],resourc:[43,45],respect:[8,13,14,28,30],respons:[3,28,30,42,53],rest:[27,28,34,45],restart:40,restor:[0,3,53],restore_checkpoint:[3,53],result:[3,4,13,16,17,18,19,25,26,27,33,34,48,52,53,54],ret:6,retrac:6,retri:0,retriev:[23,35],return_additional_data:35,reus:42,reusabl:44,reward:[0,1,2,3,4,8,10,13,20,21,26,27,28,30,32,38,40,41,42,52,53],reward_test_level:0,reward_typ:38,rgb:[30,33,38],rho:[6,8,13,14],rho_t:6,right:[2,3,6,12,31,34,41,52,53],rl_coach:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,16,18,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,40,45,53,54],rms_prop_optimizer_decai:27,rmsprop:27,roboschool:[30,49],robot:[30,38,49,51],roboti:45,robust:53,rollout:[29,36,37,43,45,54],root:[41,45],rule:[8,13,14,15],run:[0,3,4,8,10,11,12,13,14,15,17,18,23,24,27,30,31,33,53,54],run_off_policy_evalu:[3,53],run_pre_network_filter_for_infer:[3,53],runphas:[3,53],runtim:45,rvert:[16,26],rvert_2:6,s3_bucket_nam:45,s3_creds_fil:45,s3_data_stor:29,s3_end_point:45,s3datastoreparamet:29,s_t:[4,5,6,8,12,13,14,15,16,17,18,20,21,22,24,26],sac:52,sai:52,same:[3,4,7,10,13,14,20,21,24,27,30,34,35,41,44,48,52,53],sampl:[1,2,3,5,6,8,10,11,12,13,14,15,16,17,18,20,21,24,25,26,27,31,35,38,42,45,53],sample_with_info:38,satur:[8,13],save:[0,3,26,27,31,45,53,54],save_checkpoint:[3,53],saver:[3,27,53],savercollect:[3,27,53],scale:[4,10,27,33,41,45,51,54],scale_down_gradients_by_number_of_workers_for_sync_train:27,scale_measurements_target:4,scaler:27,schedul:[7,31,35,42,43,45,54],scheme:[5,31,42,52],schulman:11,sci:45,scienc:48,scipi:[33,45],scope:27,scratch:52,scratchpad:0,screen:[3,30,40,54],screen_siz:30,script:42,second:[0,27,41,52,54],section:[45,46,49],see:[3,30,33,45,48,49,52,53,54],seed:[0,30,48],seen:[4,23,24,30,33,42,48,52],segment:[30,38],select:[5,15,23,27,28,31,33,34,38,40,41,42,51,54],self:[3,27,39,40,53],send:[40,44],separ:[0,3,19,33,34,44,46,47,52],separate_actions_for_throttle_and_brak:30,seper:10,sequenti:[4,28,35],serv:[7,10,44],server:0,server_height:30,server_width:30,sess:[3,27,53],session:[3,27,53],set:[0,2,3,4,5,6,7,8,11,13,16,17,18,20,23,24,26,27,28,30,31,33,34,38,39,43,48,49,51,52,53,54],set_environment_paramet:[3,53],set_goal:30,set_incoming_direct:[3,53],set_is_train:27,set_sess:[3,53],set_variable_valu:27,set_weight:27,setup:[3,45,47,53],setup_logg:[3,53],setuptool:45,sever:[0,3,7,10,11,15,27,30,31,33,39,40,41,42,44,49,52,53,54],shape:[27,33,38],share:[0,3,27,35,44,53],shared_memory_scratchpad:0,shared_optim:27,shift:[34,42],shine:41,should:[0,3,4,7,11,15,21,24,27,28,30,33,35,38,39,40,43,53,54],should_dump:0,shouldn:15,show:48,shown:48,shuffl:[3,28,53],side:[3,53],sigma:[13,31],signal:[3,42,53],signal_nam:[3,53],significantli:19,sim:[6,12],simi
lar:[7,19,21,28,30,34,52],simpl:[10,35,39,40,44,51,52,54],simplest:52,simplif:52,simplifi:[7,41,44],simul:[30,40,47,49,54],simultan:7,sinc:[3,7,8,10,13,21,23,24,26,27,31,33,47,53],singl:[3,4,5,6,7,11,15,19,20,21,27,28,30,31,34,38,41,42,44,53],size:[27,28,31,33,34,35,38],skill:52,skip:[30,42],slave:[3,53],slice:28,slow:[27,52,54],slower:[0,13,19,27],slowli:[8,13],small:[7,13,23,35],smaller:31,smooth:[41,52],soft:[3,8,11,13,14,22,52],soft_actor_critic_ag:12,softactorcriticalgorithmparamet:12,softmax:[27,31],softmax_temperatur:27,softwar:45,sole:47,solut:52,solv:[33,40,49,51],some:[0,3,11,27,28,31,33,39,40,41,44,47,48,52,53,54],sort:25,sourc:[0,1,2,3,4,5,6,7,8,10,11,12,13,14,16,18,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,36,37,38,40,45,49,53],space:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,30,31,32,33,34,35,42,51,53],spacesdefinit:[3,27,53],spatial:52,spawn:[43,45],special:19,specif:[0,3,15,19,23,27,28,39,42,54],specifi:[0,27,30,31,33,40,43,54],speed:[27,33,52],speedup:54,spread:[33,34],squar:33,squeeze_list:27,squeeze_output:27,src:45,stabil:[6,21,27,52],stabl:[44,52],stack:[3,32,33,38,53],stack_siz:[27,33],stacking_axi:33,stage:44,stai:48,standard:[7,10,11,15,31,33,41,47],starcraft2_environ:30,starcraft2environ:30,starcraft:[38,49],starcraftobservationtyp:30,start:[3,6,8,11,12,13,14,19,24,28,33,34,40,45,53],state:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,20,21,22,23,24,25,26,27,28,30,33,35,38,39,40,42,44,46,52,53],state_key_with_the_class_index:[2,35],state_spac:30,state_valu:28,statist:[3,10,33,51,53],std:12,stdev:31,steep:31,step:[0,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,22,23,24,25,26,27,28,30,31,33,39,40,41,42,52,53,54],stepmethod:[8,12,13,21],stochast:[12,42,52],stop:[0,30],store:[0,3,23,26,28,30,33,35,41,42,43,45,51,53,54],store_transitions_only_when_episodes_are_termin:26,str:[0,2,3,4,21,27,28,30,31,33,34,38,53],strategi:[30,49],stream:[19,43],strict:48,string:[0,27,30],structur:[0,3,28,35,39,42,53],stuff:27,style:31,sub:[34,35,36,37,38,39,42,45,54],sub_spac:38,subset:[41,48,52],subtract:24,succeed:30,success:[0,30,52],suffer:41,suffici:28,suffix:[3,27,53],suggest:39,suit:[0,49],suitabl:[43,54],sum:[4,7,10,20,27,28],sum_:[5,12,16,20,21,23,26],summari:[0,3,53],supervis:52,suppli:[3,53],support:[0,3,27,30,31,41,44,45,46,47,49,51,54],sure:[0,3,45,48,53],surrog:7,surround:14,swig:45,swingup:30,symbol:27,sync:[3,27,42,43,53],synchron:[0,27,42,44],system:47,t_max:[10,21],tag:45,take:[0,3,10,11,19,23,24,27,30,31,32,40,41,42,53],taken:[1,2,4,5,6,7,8,11,12,13,16,19,23,24,25,26,27,28,30,31],tanh:[8,13],tar:45,target:[0,1,2,3,4,5,6,7,8,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,27,30,33,34,38,39,42,44,53],target_act:34,target_kl_diverg:11,target_network:27,target_success_r:30,targets_horizon:21,task:[0,1,2,30,33,39,41,49],task_index:0,tau:12,td3:52,td3_agent:13,td3algorithmparamet:13,techniqu:[7,11,51,52],technolog:43,teh:27,temperatur:[27,31],temperature_schedul:31,tensor:[3,27,53],tensorboard:0,tensorflow:[0,3,27,53,54],tensorflow_support:27,term:[6,7,11],termin:[3,8,13,28,42,53],test:[0,3,5,6,8,9,10,11,12,13,14,27,39,48,51,54],test_using_a_trace_test:0,text:6,textrm:42,than:[0,3,11,13,27,31,41,44,47,53],thei:[3,23,24,27,31,41,42,43,52,53,54],them:[4,5,10,21,27,28,30,33,38,40,41,44],therefor:[0,8,13,27,32,52],theta:[6,7,8,12,13,14,16,26,31],theta_:[6,7],thi:[0,3,4,5,6,7,8,10,11,13,15,19,21,23,26,27,28,30,31,32,33,34,35,36,38,39,40,41,42,43,44,45,47,48,52,53,54],thing:[41,47],those:[0,3,8,13,14,15,17,18,19,23,28,31,34,42,44,46,52,53],thousand:[11,15,1
6,17,18,20,24,25,26],thread:27,three:[3,43,44,45,46],threshold:[11,23,33],through:[0,3,4,8,9,10,11,13,14,15,23,24,27,39,40,42,44,53],tild:[8,12,13,14],time:[0,4,24,27,31,34,35,41,44,52],time_limit:40,timestep:[4,10],timid:45,tmp:0,togeth:[3,21,28,42,53],toggl:41,too:11,tool:[41,45,52],top:[27,30,32,33,35,40,41,52],torqu:30,total:[0,3,10,11,20,23,24,28,35,39,41,52,53],total_loss:27,total_return:28,trace:0,trace_max_env_step:0,trace_test_level:0,tradeoff:31,train:[0,3,19,27,31,36,37,39,40,41,42,43,44,47,48,51,52,53],train_and_sync_network:27,train_on_batch:27,train_to_eval_ratio:35,trainer:[29,43],transfer:[30,36,49],transit:[1,2,3,4,5,6,8,10,11,12,13,14,16,17,18,21,23,24,25,26,35,39,42,43,53],transition_idx:28,tree:14,tri:52,trick:48,tricki:41,trigger:[30,45],truncat:6,truncated_norm:31,trust:[6,52],ttf2:45,tune:31,tupl:[1,2,3,8,13,27,28,30,35,38,39],turn:[2,52],tutori:[39,40,47],tweak:[3,53],twin:3,two:[8,10,13,21,27,30,31,32,33,34,38,40,43,44,52,54],txt:45,type:[0,3,10,19,27,30,33,38,39,42,44,51,52,53,54],typic:[7,11,27,52,54],ubuntu16:45,uhlenbeck:[8,9,31],uint8:33,unbound:38,uncertain:31,uncertainti:31,unchang:11,unclip:[3,39,53],uncorrel:21,undeploi:43,under:[3,27,39,54],underbrac:5,understand:54,unifi:7,uniformli:[30,31,34,38],union:[3,28,30,31,34,38,53],uniqu:27,unit:41,unlik:[11,14],unmask:34,unnecessari:0,unshar:[3,53],unsign:33,unspecifi:27,unstabl:[41,48],until:[0,6,10,11,23,26,31],unus:27,unzip:45,updat:[3,6,7,8,10,11,12,13,14,15,16,17,18,19,21,22,23,24,25,26,27,28,31,39,40,41,42,44,45,52,53],update_discounted_reward:28,update_filter_internal_st:[3,53],update_log:[3,53],update_online_network:27,update_step_in_episode_log:[3,53],update_target_network:27,update_transition_before_adding_to_replay_buff:[3,53],upgrad:45,upon:[3,5,39,53],upper:[6,31],usag:[34,47,51],use:[0,1,2,3,4,5,6,8,9,10,12,13,14,15,17,18,22,27,28,29,30,31,33,34,35,38,39,40,42,44,45,47,51,52,53,54],use_accumulated_reward_as_measur:4,use_cpu:0,use_deterministic_for_evalu:12,use_full_action_spac:30,use_inputs_for_apply_gradi:27,use_kl_regular:[7,11],use_non_zero_discount_for_terminal_st:[8,13],use_separate_networks_per_head:27,use_target_network_for_evalu:[8,13],use_trust_region_optim:6,used:[0,2,3,5,6,7,8,10,11,12,13,14,15,16,20,21,22,23,24,25,27,30,31,33,34,35,36,37,39,40,42,43,44,47,48,53,54],useful:[0,3,4,26,27,31,33,38,48,52,53,54],user:[27,30,31,41,42,45],userguid:45,uses:[0,1,7,11,19,28,29,31,37,42,43,45,48,52,54],using:[0,3,5,6,7,8,10,11,12,13,14,17,18,20,21,22,23,24,26,27,29,30,31,33,36,39,40,41,43,47,49,52,53,54],usr:45,usual:[33,42],util:[3,41,53],v_max:16,v_min:16,val:[3,38,53],valid:[0,14,38],valu:[0,2,3,4,5,6,7,8,11,12,13,14,15,16,17,18,19,21,22,23,24,26,27,28,30,31,33,34,35,38,39,42,44,45,46,52,53],valuabl:41,value_targets_mix_fract:[7,11],valueexcept:[3,53],valueoptimizationag:39,van:4,vari:44,variabl:[27,30,45],variable_scop:27,varianc:[10,31,41,52],variant:[31,35,52],variou:[3,28,35,51],vector:[3,4,8,9,11,13,15,27,30,33,38,40,44,52,53],vectorobservationspac:33,verbos:30,veri:[0,7,8,10,13,19,23,41,52,54],version:[7,11,28],versu:27,vert:12,vertic:27,via:[2,15],video:[0,3,30],video_dump_method:0,view:41,viewabl:[3,53],visit:48,visual:[0,3,30,49,51],visualization_paramet:30,visualizationparamet:[3,30],vizdoom:[45,49],vote:31,wai:[3,7,11,31,34,40,42,44,51,52,53,54],wait:[5,27,43],walk:40,want:[3,4,26,27,33,34,35,47,53],warn:[31,33,34],wasn:28,weather_id:30,websit:[30,51],weight:[4,5,6,7,8,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,27,31,42,44,52],well:[23,27,31,38,52],went:11,were:[4,16,17,18,19,23,25,
26,27,28,34,48],west:45,wget:45,what:[11,47,52],whatev:[3,53],when:[0,3,4,5,6,7,8,9,10,11,12,13,14,23,27,28,29,30,31,33,36,37,39,40,41,53,54],whenev:43,where:[2,3,4,5,6,7,11,15,16,19,21,23,24,26,27,28,30,31,33,34,38,41,47,52,53],whether:31,which:[0,1,2,3,5,6,7,8,10,11,12,13,14,15,19,21,22,23,24,25,27,28,29,30,31,33,35,36,37,38,39,40,41,42,43,44,46,47,48,49,51,52,53,54],who:42,why:[41,42],window:[33,34],wise:33,within:[0,7,11,22,31,38,41],without:[5,11,34,35,41,52,54],wolperting:3,wolpertinger_ag:14,wolpertingeralgorithmparamet:14,won:[4,27],wont:27,work:[3,21,27,31,33,34,41,42,52,53,54],workaround:0,workdir:45,worker:[0,21,27,29,33,35,36,37,41,43,44,45,52,54],worker_devic:27,worker_host:0,wors:52,would:[27,45,47,52],wrap:[30,33,42,49],wrapper:[3,27,28,30,38,44,53],write:[0,3,53],written:[3,26,29,53],www:45,xdist:45,y_t:[8,12,13,14,15,17,18,20,22,23,24],year:52,yet:[19,40],you:[4,33,35,39,40,45,51,54],your:[39,40,45,54],yuv:33,z_i:[16,26],z_j:[16,26],zero:[2,13,17,18],zip:45,zlib1g:45},titles:["Additional Parameters","Behavioral Cloning","Conditional Imitation Learning","Agents","Direct Future Prediction","Actor-Critic","ACER","Clipped Proximal Policy Optimization","Deep Deterministic Policy Gradient","Hierarchical Actor Critic","Policy Gradient","Proximal Policy Optimization","Soft Actor-Critic","Twin Delayed Deep Deterministic Policy Gradient","Wolpertinger","Bootstrapped DQN","Categorical DQN","Double DQN","Deep Q Networks","Dueling DQN","Mixed Monte Carlo","N-Step Q Learning","Normalized Advantage Functions","Neural Episodic Control","Persistent Advantage Learning","Quantile Regression DQN","Rainbow","Architectures","Core Types","Data Stores","Environments","Exploration Policies","Filters","Input Filters","Output Filters","Memories","Memory Backends","Orchestrators","Spaces","Adding a New Agent","Adding a New Environment","Coach Dashboard","Control Flow","Distributed Coach - Horizontal Scale-Out","Network Design","Usage - Distributed Coach","Algorithms","Batch Reinforcement Learning","Benchmarks","Environments","Features","Reinforcement Learning Coach","Selecting an 
Algorithm","test","Usage"],titleterms:{"final":23,"function":22,"new":[39,40],"switch":54,Adding:[39,40],Using:40,acer:6,across:52,action:[4,5,6,7,8,9,10,11,12,13,14,15,22,23,34,38,52],actioninfo:28,actor:[5,9,12],addit:[0,54],additivenois:31,advantag:[22,24],agent:[3,39,42,54],algorithm:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,46,52,54],api:40,architectur:27,attentionactionspac:38,backend:36,balancedexperiencereplai:35,batch:[28,47],behavior:1,benchmark:48,between:54,blizzard:30,boltzmann:31,bootstrap:[15,31],boxactionspac:38,build:45,can:52,carla:30,carlo:20,categor:[16,31],choos:[4,5,6,7,8,9,10,11,12,13,14,15,22,23],clip:7,clone:[1,45],coach:[40,41,43,45,51],collect:52,compar:41,compoundactionspac:38,condit:2,config:45,contain:45,continu:[7,11,12,52],continuousentropi:31,control:[23,30,42],copi:44,core:28,creat:45,critic:[5,9,12],dashboard:41,data:29,deep:[8,13,18,54],deepmind:30,delai:13,demonstr:52,descript:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],design:44,determinist:[8,13],direct:4,discret:[5,6,10,52],discreteactionspac:38,distribut:[43,45],distributedtaskparamet:0,doe:52,doubl:17,dqn:[15,16,17,19,25],duel:19,dump:54,egreedi:31,environ:[30,40,49,52,54],envrespons:28,episod:[23,28,35],episodicexperiencereplai:35,episodichindsightexperiencereplai:35,episodichrlhindsightexperiencereplai:35,evalu:54,experiencereplai:35,explor:31,explorationpolici:31,featur:50,file:45,filter:[32,33,34],flag:54,flow:42,framework:54,from:52,futur:4,gener:19,gif:54,goal:38,gradient:[8,10,13],graph:42,greedi:31,gym:[30,40],have:52,hierarch:9,horizont:43,human:[52,54],imag:45,imageobservationspac:38,imit:[2,54],implement:45,input:33,interfac:45,keep:44,kubernet:37,learn:[2,21,24,47,51,54],level:42,manag:42,memori:[35,36],mix:20,mont:20,more:52,multi:54,multipl:52,multiselectactionspac:38,network:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,44],networkwrapp:27,neural:23,nfsdatastor:29,node:[52,54],non:35,normal:22,observ:[33,38],observationclippingfilt:33,observationcropfilt:33,observationmoveaxisfilt:33,observationnormalizationfilt:33,observationreductionbysubpartsnamefilt:33,observationrescalesizebyfactorfilt:33,observationrescaletosizefilt:33,observationrgbtoyfilt:33,observationsqueezefilt:33,observationstackingfilt:33,observationtouint8filt:33,openai:[30,40],optim:[7,11],orchestr:37,ouprocess:31,out:43,output:34,pain:52,parallel:52,paramet:0,parameternois:31,persist:24,plai:54,planarmapsobservationspac:38,polici:[7,8,10,11,13,31],predict:4,prerequisit:45,presetvalidationparamet:0,prioritizedexperiencereplai:35,process:52,proxim:[7,11],push:45,qdnd:35,quantil:25,rainbow:26,redispubsubbackend:36,regress:25,reinforc:[47,51],render:54,repositori:45,reward:33,rewardclippingfilt:33,rewardnormalizationfilt:33,rewardrescalefilt:33,run:[41,45],s3datastor:29,sampl:52,scale:43,select:52,signal:41,simul:52,singl:54,singleepisodebuff:35,soft:12,solv:52,space:[38,52],starcraft:30,statist:41,step:21,store:[15,29],structur:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26],suit:30,support:43,sync:44,synchron:43,task:52,taskparamet:0,test:53,thread:54,through:54,track:41,train:[1,2,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,20,21,22,23,24,25,26,54],transit:[15,28],transitioncollect:35,truncatednorm:31,twin:13,type:[28,43],ucb:31,usag:[45,54],vectorobservationspac:38,visual:[41,54],visualizationparamet:0,vizdoom:30,wolperting:14,you:52,your:52}}) \ No newline at end of file diff --git a/docs/test.html b/docs/test.html index 
f95618f..0798236 100644 --- a/docs/test.html +++ b/docs/test.html @@ -439,7 +439,7 @@ given observation

    -prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.array]
    +prepare_batch_for_inference(states: Union[Dict[str, numpy.ndarray], List[Dict[str, numpy.ndarray]]], network_name: str) → Dict[str, numpy.core.multiarray.array]

    Convert curr_state into input tensors tensorflow is expecting, i.e. if we have several input states, stack all observations together, measurements together, etc.
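    The docstring above describes stacking several input states into batched tensors, one per observation name. As a rough, hypothetical sketch of that stacking behavior (not Coach's actual implementation; the simplified signature, the ignored ``network_name`` argument, and the example keys are illustrative only):

    .. code-block:: python

        import numpy as np

        def prepare_batch_for_inference(states, network_name=None):
            # Accept either a single state dict or a list of state dicts.
            if isinstance(states, dict):
                states = [states]
            # Stack each observation name across the batch into a single numpy array.
            return {key: np.stack([state[key] for state in states]) for key in states[0]}

        # Two states, each holding an 'observation' vector and a 'measurements' vector.
        batch = prepare_batch_for_inference(
            [{'observation': np.zeros(4), 'measurements': np.ones(2)},
             {'observation': np.ones(4), 'measurements': np.zeros(2)}])
        print(batch['observation'].shape)  # (2, 4)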

    diff --git a/docs_raw/source/_static/img/algorithms.png b/docs_raw/source/_static/img/algorithms.png index 6c00f21..0849ad7 100644 Binary files a/docs_raw/source/_static/img/algorithms.png and b/docs_raw/source/_static/img/algorithms.png differ diff --git a/docs_raw/source/_static/img/design_imgs/wolpertinger.png b/docs_raw/source/_static/img/design_imgs/wolpertinger.png new file mode 100644 index 0000000..e7f9b37 Binary files /dev/null and b/docs_raw/source/_static/img/design_imgs/wolpertinger.png differ diff --git a/docs_raw/source/algorithms.xml b/docs_raw/source/algorithms.xml index e6f68c9..377709c 100644 --- a/docs_raw/source/algorithms.xml +++ b/docs_raw/source/algorithms.xml @@ -1 +1 @@ -7V1bk5s2FP41O9M+JIMQ18e9NduZbCdp0jR5ysgg26SAXMC73v76CnMVEphdkLGzbB5sDhJgne/7dCQdkQt4HezeRWizvicu9i9Uxd1dwJsLVQVAt+hHannKLIZuZ4ZV5Ll5ocrwyfsP50Ylt249F8dMwYQQP/E2rNEhYYidhLGhKCKPbLEl8dm7btAKc4ZPDvJ569+em6wzq6Urlf0Oe6t1cWeg5GcCVBTODfEaueSxZoK3F/A6IiTJvgW7a+ynjVe0S1bvt5az5YNFOEz6VPh+fW+gL8GXH+p7HP5156+j3dc3mp4/XPJU/GLs0gbID0mUrMmKhMi/raxXEdmGLk4vq9Cjqsx7QjbUCKjxB06Sp9ybaJsQalongZ+fxTsv+ZpWf6vnR99qZ252+ZX3B0/FQZhET7VK6eG3+rmq2v6oqBcnEfmn9F16j+wXpz+ztSVzU0y2kYM7mi8ttYckilY46SgItdLjlCqYBJg+I60YYR8l3gP7JCjH7Kosl1e9jCL0VCuwIV6YxLUrf0gNtEBOP6jl2MvJpxsNiDTLm1ZXefole4LiqPZTKtMeds+AYP6jH5C/zZuBg2QFuNR/j2svwZ82aO+ZRyo7LLiWnu9fE59E+7pwqaf/SiDUzhj7v7QGCZOaPftLaxSMTXGEIifHs8qhSi1R9YCjBO+6ccWjoKigs+4ChXQ8Vsqj5i5R1jXVgYXoiJBT894LnKNwzpj14Rn60FceBqrDIB+rHAHJIsbRA701CYdxkWEQT0DHwoulkE0NErsIW0uHYSEYiXRU9N7qDO0MnnW2zpNOlcY5wDX6zLn+nIM9OTe0Rx7kYyjo9AyfPu7Vgn5ZJfsmyQxp78Sgwfh3S4oTb+K9Py9pAaBvdtXJ4iq/h5ttWv02WGDXxVFxVfrU2YXZm1Fz7QHkUd9SF3Df9x6mvo4tV2Oob4zEfMVqMB9AnvpAhzz3LVnc146Fi3vPdX38SBtrRkQNETaDBxoBC/Cg8njQZOFBHzk4PgMfADYILrnG+EAQBJfkHd0JhgRSQhEpb24+vKMnL52EtrOq3GHkDmEndULScD/j8ZCEuOHa3IR8bxXSQ4c6jfYZ8Cp1qecg/zI/EezlowVvbCzCQG4EgABDacg2FNBUEyBEXsSm/ewR24jxmdkzPrOmjM/MVsr3ZbgtInirbiyiUgNwGGMxvZtF/9gG9EZkm9AQL87uGm6D78hJB24xX/6AltQLyutjXNNeKPwgQNDHLJfYcCQN+lRWQVRL0NGropkWXZaEzBOxQ0TF6ikq9pSiYo0fRwhV5jMK754VN7w+/utqM4aA8C2vAYZo3gfKkgB7loABEmD3lAAwaWBhcxrwlfe673ubuC26rvkOxZtszXPp7VIMXLE8HZ2IYzDPZId30OSHd1C4xCGLdXBm3QDWaT1ZZ05JOmBO6WNQ83Dl70M+ZjxcOXwKHwPjHKIr0DlNcziUskSh1GXbWOpPFK56DtM47L2aeRlz4nkZYxjvwSvXdtB3VJXNSh87u6UctOdw06wD2S2ws7yk7BZ+0JfNz1BbgNFrXF/XtIkX2IsLzylHKrDYcFyzBSlHotUWaEjzjtraj/deXVFEfblDQgclrRMjLx2EncGYy26k9QHeyaZglVsaAeHYOX9nsKxpaKwPBLGRcGkZyPOCiGg/uRcaCbGmIMMSiDIsAZCW8WHNMeqQbK++6V5Z8DdZvld7wtcJLSjOC4Rt05S6LVgcMARhkaz1QW0eyg6Sib7zlNCYVCba8/9mmThBmYAaO7SdXCaKCHKWiZfJhN5TJrRJp7qhKC10lomTlYlmNGEIZsCOKxPDtpi8epnouyCmK2JcHEkm2lfEZpk4QZloRhOTy4Q2RxPH2R0+dP/ni9bPnr07XIFd5eWsn2nGDMH5BQWTvqBAH7a7YxoIqqeDQa1vZqY+iQyqjSVQ07QPyGBneUkYPJtVgiNAaShCDkLAMhurPRl081qNoGqMTo7PCp4kUBcG5WXlkN9uBOZwvmux0WSFQrC7/Lij/mF7jKbpyMzT6ciKPJgT7chgMatTdkwHginV7CovqSMT5V7JkLrxZiAGPVeXaM4SCUEzOXB6kTybASevdSewu1vvm49hqFJCOQ2wotY3lHu22AJ2MsTSDoitAbrKSxJbCa8ZEqrcvRceELXXvuHUMliZsy3BdtNipuEo20314YMOAERg+DjOZYqrxNsSOPWes2amnyhIMREu4vQj+0UH8diZ/vvTpeFr7MyZJdh3KdztLG1vjioJfoEX/jIBBGkrKB8FFVRxhV9nfLKRoAobkaAIoeZREdqeZVG68fMNlWfly0X6EM1OrfH6pkWrL/n9gCM0KLAbgXUxyVTfjCdI+i/W/MZvzh6v0RK+le7YDWexEaQhyBHXRB21tA0xZ7P+cJIjkhzPR9rEKNh12Nhh1Xx35EgDkuZuyOJF9G0DEq3xXM8tf+TdlvSwelV+Vrz6Dwfg7f8= \ No newline at end of file 
+7V1bk5s2FP41O9M+ZAdJXB/31t3OZDtJk6bJUwaDbJMCcgHvevvrK4ywEYiL18jgLJsHm4MEWOf7Ph1JR+QC3QSb+8heLR+Ji/0LqLibC3R7ASFQoU4/UstLZjGgmhkWkeeyQnvDJ+8/zIwKs649F8dcwYQQP/FWvNEhYYidhLPZUUSe+WJz4vN3XdkLXDF8cmy/av3bc5NlZjU1ZW9/wN5imd8ZKOxMYOeFmSFe2i55LpjQ3QW6iQhJsm/B5gb7aePl7ZLV+63m7O7BIhwmXSp8v3nU7S/Blx/wPQ7/evCX0ebrO1VjD5e85L8Yu7QB2CGJkiVZkND27/bW64isQxenl1Xo0b7Me0JW1Aio8QdOkhfmTXudEGpaJoHPzuKNl3xNq19q7Ohb4czthl15e/CSH4RJ9FKolB5+K57bV9se5fXiJCL/7HyX3qPacKwtY7KOHNzQWmmpLQLtaIGThoKIgTxty8ItmGPuMQkwfUZaIMK+nXhPPNhshtnFrhyrehVF9kuhwIp4YRIXrvwhNdACjH5IZdhj5NP0EkTK5Q2zqTz9kj1BflT4KXvTFnYHQJD96CfbX7NmqEByD7jUf89LL8GfVvbWVc9UdnhwzT3fvyE+ibZ10VxL/+2AUDijb//SGiRMCvbsL62RMzbFkR05DM+wgirYhKonHCV404iC/KzGuwvk0vG8Vx7IXKIsC6qDctERIafgvVc4R6k4Y9KHen3oKg99q8NRPoYVApJZjKMnemsSHsdFjkFVAjomns2FbCqR2LWxOXc4FoKeSEdF71LjaKdXWWdpVdJBaZwDlUafONfGpVbO9d4jH+VjJOj0dJ8+//WMflkk2ybJDGnvxKFB/3dN8hPv4q0/r2gBoK02+5P5VX4PV+u0+l0ww66Lo/yq9KmzC/M3o+bCA8ijvglnaNv3tlNfw6arctTXe2K+YpaYD1CV+kBDVe6bsrivngoXj57r+viZtt+EiAIiLA4PNAIW4AFW8aDKwoPWc3B8Bj4AfBC84xrnA0EQvCNv707QJZASiUh5e/vhnp68chLazlB5wLZ7DDtpiycl93MeD0mIS65lJtv3FiE9dKgfaZ+BrlP/eY7tX7ETwVY+avDGxyIc5HoACNCVkmwjAU1VAULkRWzqzx6xvT4+MzrGZ+ao4jOjlvJdGW6JCF6rG7NopwE4jLGY3uWif6wDeiOyTmiIF2d3DdfBd9tJB25xtXyLlhQLyutjXMOaKdVBgKCPmc+x7kga9EFeQaAp6OihaKZFkyUh00TsAaJidhQVa1SiYvYfRwhV5rMdPhwUN7w9/muwHEMgdFnVAF0074NkSYA1SUB3CbA6SgAYV2BhVTTga9Xrvu+t4rrouuA7O15la55zb5Ni4Jrnae9E7IN5Bj+8Q0Z1eIeESxyyWIcm1nVnndqRdcaoSAeMIX0MCh7e+7vNx5yH9w4/gY+BfpbRFWicpmkPpUxRKHVVN5b60w4XHYdpFey9mXkZY+B5Gf043oO3pe2g66gqn4QeOLtlN2hncFPNluwW1FheUnZLddCXzc9QW4Dtt7i+rqoDL7DnF55SjiAw+XBctQQpR6LVFqRL8w6s7cc7r64oor7cIaFjJ7UTI68dhJ3BmMsqpfWBqpMNwSq3NAKivnP+zmBZU1d5HwhiI+HSMpDnBRHRfnIvlBJiDUGGJRBlWAIgLePDnGLUA7K9uqZ75bHeSAanqD7ha0QLitMCYd00pWYJFgd0QVgka31QnYayh8hE13lKpI9LJurz/yaZGKFMIJUf2g4uE3kEOclEJ5nQOsqEOq6pbiRKC51kYrQyUY4mdMEM2Gll4rgtJm9NJrouiOUTZWORifoVsUkmRigT5WhicJlQp2jiAJnIp8/aBx297/981frZwbvDFdRUXs76mapPEJxeUDDoCwq043Z3DANBOFyKWNfMTG0cMghLS6CGYbXIYGN5SRg8m1UCGVDqGSGtEDCN0mpPhmVWqxRU9dHJVbOCBwnUhUH5rnJY3W4EpnC+abHR4IVCsLv8tKP+4/YYDdORGYN1ZPlo/lw6MpTP6uw6ppZgChpN5SV1ZKLcKxlS198MxFHP1SSak0QiUE4OHF4kz2bAWdW60+/u1rrmY+jwNKGcCnhR6xrKHSy2gJ8MMdUWsdVBU3lJYivhNUNClXv0whZRe+sbTk2dlznLFGw3zWcaTrLdVDt+0AGACAwf+7lMfpV4vQNOsecsmOmnHaSYCGdx+pH9olY8Nqb//nRp+Co/c2YK9l0KdztL25sDJcEv8MJfBoAgbQXlo6ACFFf4dcInHwlCVIoERQg1TorQ+iyLnRs/31J5Vr5cpA9R7tRKr2+a1fqyuh+whwYFVimwzieZipvxBEn/+Zpf/83Z4TVawrfSnbrhTD6C1AU54qqoo5a2IeZs1h/GMCJh8B1qE6Ng12Fph1X53ZE9DUjKuyHzF9HXDUjU0nMdWv7Euy3p4f5V+Vnx/X84gO7+Bw==7Vxbt5o4FP415/F0kQQCPOq5tJ1e1plpV6edl1kRojIHiYNYtb++QYkQEi+nEPGcqg/KzgXI/vaXnZ0NV+hmsnydkun4AwtpfAWtcHmFbq8gBDbE/CeXrDYSF9obwSiNwqJSKfgU/aCF0Cqk8yikM6lixlicRVNZGLAkoUEmyUiasoVcbchi+axTMqKK4FNAYlX6dxRm443Uc6xS/oZGo7E4M7CKkgkRlQvBbExCtqiI0N0VukkZyzb/JssbGueDJ8Ylip0vDPcH797f/fEwsd8FD9eL601n909psr2FlCbZL3f9o2e/S1b35Os/N29Xt70VfvsQi66/k3hejFdxr9lKDGDK5klI807AFeovxlFGP01JkJcuOGS4bJxN4qJ4GMXxDYtZum6Lhk7+5fJZlrJHWinB60/egiVZRb755C3EaFv8gKRBgSy47UxoM5cUd0HTjC5raj8wZmCrSG4BlE1olq54O9GLgEkBfiCwsCihBHEhG1dghASKSAHf0bbvUkX8T6GlJ2hMVRANOeCLQ5ZmYzZiCYnvSmm/VGE+mGWd94xNC8X9R7NsVYwxmWdMVitdRtnXvPkrpzj6Vim5XRY9rw9W4iDht1tplB9+q5aVzdZHop2sXbDVbn6bv6BbPlRsngZ0z5CigpZIOqLZnnqOHispjUkWfZcvrnW9I8VS2WBG0+/81CxpZrSSqamWGnh0MNSaXc3aQ0K9YSCZKzBpnRC5rxzJPrFqnr6jWic0ZZz2xTjbNk7nSOPEXRqno5lGccwvtz/gf0bZepw2gny+kyCC/58zUXA9Wyu5xysAZ7osC0Uvb5PpPG9+NxnQMKSp6JVf9aZj+WRcXLkAcxzhwQFaz+aHOcKhXmhLHIFNUoTl1SgCIJUjgINUkvBMkQQ+FVg+RGEY0wUfwQtMDsHEl0ACoaMBCVRBYpsCiduyY/5cFQNkB3xrlZJiNA741sxb14xnwHyRznxvbx9e88JekPHBh9YbSsImdsw1k9UwIcEgYQmt6bsQkTgaJfww4JrkUw7q53qO+BK7VxRM1kSzA4SyfyPh0BRqALZqrI80Bm
1rYGPMM/Rfumdo2g8URHDQEQSwS09QXKaGHo5lA19HBjs5ZpBu+YImM6qngnrVj/MJPxGbZ9ybnG3Omswn/5IgX0zO1PoHeKda0dwkFbr+wFIXIZpJajikODjlQhTKbAN9zSwFtWEiUzAEiiZeGN+cfiUK4LEMZHfKQLB9B0VLSZ9J8uZJDsmFLNbk4MC6c4LQK9U9wbrAFTIFGnThi9b5wj6WL/xO+cJW+OKrCoY4jqazXS5+RaVkNt3s4g2jZQ6NvmzTrRutMSt15YUnctUpHZ10RncuFtq2heJjDdTSY+VEBoq71Dyo6L1EwSHNS3ovYXA2mgfu8/DldGHArS932G/zdH5bb9cq7y+SjI5cQCqI/L2jS27H0SXcLLy0ZoP5ZCrqF5o5+/mCpFkvTxEqL3ktu4/y0VvXqWWX+L0bt39/Cn7xj+QX22vIL+umfBTIqlJhyqIkm1V6fsgFFe/GkwFse7XcoVp94FuN6mNrf/369dTq8z+bOywtZDtUDbZYmq15ujWaFwF+3K1f5SuT6zRlXDn8XMFvnVUj27rdcVIN1IXUm2jmhSUpQqvuj2AANAElBHQuiXBd2s+JcZ8xvZr0SUzT7zbN+mA+Y6c5U6LjBjtlwNKtbwKWBCTbGZn+1cjWMw1kIVAPN7sW1rCDq0mBMkfpajarycn2fNNbalFGz3I0qgEYqLrBxnj7spbshreRdyRv2ztAdaJE9N0JUO1mODwhn+HANmSj60rUXAlwyYfYT2ueX5tzPKjzSPU5EaaIzW7B4QA6hPzZTjf8l0xyBCSDWf6zuc6DG+x7vZvfYsWKgLxi9Xxft5+OT+jg2DoHp8ZvX66UpM6nZG8aM15QX0564pHPanjbVkfThoZG02229/lyXZKuInz2sa4KbOqqtBLexv7+8DOy/X31zYSfbROJ5JeHhtpgQNGPgAP0Na6LpxKgY2o6cZs9WHghwNYJ8Ngtjm4IcPvWhrMmQHX/5UKAZ0GACIP6jgJ0teu3U5Kg03y/9kSMp+HaswgwHclZdtN81FacNnAgZ8CGe+sbyhm4bGqd10TsHJvDiZom2jTjrstD32c3y0HhVYt9MREg7Oyhb+xd2OWs2EW8T8Z8Gl8z3KgZMo/XHz9yyeeUch6wuOIf59NmVt5R1kzeiQC0COAbf4DT0b0kRLdZYczZxeqE8TklURIlI0WNamy4zZxqWQG7bTEmAxr3SfA4WreXtp/yj0EmB9iurVecrp/3d9XNpjzv/qI9jfHhejKya0p3/LB8X+NmTVC+9RLd/QQ= \ No newline at end of file diff --git a/docs_raw/source/components/agents/index.rst b/docs_raw/source/components/agents/index.rst index ca21713..c958768 100644 --- a/docs_raw/source/components/agents/index.rst +++ b/docs_raw/source/components/agents/index.rst @@ -21,8 +21,6 @@ A detailed description of those algorithms can be found by navigating to each of imitation/cil policy_optimization/cppo policy_optimization/ddpg - policy_optimization/td3 - policy_optimization/sac other/dfp value_optimization/double_dqn value_optimization/dqn @@ -36,6 +34,10 @@ A detailed description of those algorithms can be found by navigating to each of policy_optimization/ppo value_optimization/rainbow value_optimization/qr_dqn + policy_optimization/sac + policy_optimization/td3 + policy_optimization/wolpertinger + .. autoclass:: rl_coach.base_parameters.AgentParameters diff --git a/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst b/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst new file mode 100644 index 0000000..5aa57d2 --- /dev/null +++ b/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst @@ -0,0 +1,56 @@ +Wolpertinger +============= + +**Actions space:** Discrete + +**References:** `Deep Reinforcement Learning in Large Discrete Action Spaces `_ + +Network Structure +----------------- + +.. image:: /_static/img/design_imgs/wolpertinger.png + :align: center + +Algorithm Description +--------------------- +Choosing an action +++++++++++++++++++ + +Pass the current states through the actor network, and get a proto action :math:`\mu`. +While in training phase, use a continuous exploration policy, such as the a gaussian noise, +to add exploration noise to the proto action. Then, pass the proto action to a k-NN tree to find actual valid +action candidates, which are in the surrounding neighborhood of the proto action. Those actions are then passed to the +critic to evaluate their goodness, and eventually the discrete index of the action with the highest Q value is chosen. +When testing, the same flow is used, but no exploration noise is added. + +Training the network +++++++++++++++++++++ + +Training the network is exactly the same as in DDPG. 
Unlike when choosing an action, the proto action is not passed +through the k-NN tree; it is passed directly to the critic. + +Start by sampling a batch of transitions from the experience replay. + +* To train the **critic network**, use the following targets: + + :math:`y_t = r(s_t, a_t) + \gamma \cdot Q(s_{t+1}, \mu(s_{t+1}))` + + First, run the actor target network, using the next states as the inputs, and get :math:`\mu(s_{t+1})`. + Next, run the critic target network using the next states and :math:`\mu(s_{t+1})`, and use the output to + calculate :math:`y_t` according to the equation above. To train the network, use the current states and actions + as the inputs, and :math:`y_t` as the targets. + +* To train the **actor network**, use the following equation: + + :math:`\nabla_{\theta^\mu} J \approx E_{s_t \sim \rho^\beta} [\nabla_a Q(s,a)|_{s=s_t, a=\mu(s_t)} \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t}]` + + Use the actor's online network to get the action mean values, with the current states as the inputs. + Then, use the critic's online network to get the gradients of the critic output with respect to the + action mean values :math:`\nabla_a Q(s,a)|_{s=s_t, a=\mu(s_t)}`. + Using the chain rule, calculate the gradients of the actor's output with respect to the actor's weights, + given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network. + +After every training step, do a soft update of the critic and actor target networks' weights from the online networks. + + +.. autoclass:: rl_coach.agents.wolpertinger_agent.WolpertingerAlgorithmParameters \ No newline at end of file diff --git a/docs_raw/source/diagrams.xml index 15f067f..9b5e64a 100644 --- a/docs_raw/source/diagrams.xml +++ b/docs_raw/source/diagrams.xml @@ -1 +1 @@
[diff of a base64-encoded diagram payload in the generated docs omitted — old and new blobs are not human-readable]
diff --git a/rl_coach/agents/agent.py b/rl_coach/agents/agent.py
index 3db0aaf..5d12e0b 100644
--- a/rl_coach/agents/agent.py
+++ b/rl_coach/agents/agent.py
@@ -1003,7 +1003,7 @@ class Agent(AgentInterface):
         """
         Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
         has another master agent that is controlling it. In such cases, the master agent can define the goals for the
-        slave agent, define it's observation, possible actions, etc. The directive type is defined by the agent
+        slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
         in-action-space.
 
         :param action: The action that should be set as the directive
diff --git a/rl_coach/agents/wolpertinger_agent.py b/rl_coach/agents/wolpertinger_agent.py
new file mode 100644
index 0000000..a16b9e9
--- /dev/null
+++ b/rl_coach/agents/wolpertinger_agent.py
@@ -0,0 +1,131 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import copy
+from typing import Union
+from collections import OrderedDict
+import numpy as np
+
+from rl_coach.agents.ddpg_agent import DDPGAlgorithmParameters, DDPGActorNetworkParameters, \
+    DDPGCriticNetworkParameters, DDPGAgent
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import ActionInfo
+from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
+from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
+from rl_coach.memories.non_episodic.differentiable_neural_dictionary import AnnoyDictionary
+from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
+from rl_coach.architectures.head_parameters import WolpertingerActorHeadParameters
+
+
+class WolpertingerCriticNetworkParameters(DDPGCriticNetworkParameters):
+    def __init__(self, use_batchnorm=False):
+        super().__init__(use_batchnorm=use_batchnorm)
+
+
+class WolpertingerActorNetworkParameters(DDPGActorNetworkParameters):
+    def __init__(self, use_batchnorm=False):
+        super().__init__()
+        self.heads_parameters = [WolpertingerActorHeadParameters(batchnorm=use_batchnorm)]
+
+
+class WolpertingerAlgorithmParameters(DDPGAlgorithmParameters):
+    def __init__(self):
+        super().__init__()
+        self.action_embedding_width = 1
+        self.k = 1
+
+
+class WolpertingerAgentParameters(AgentParameters):
+    def __init__(self, use_batchnorm=False):
+        exploration_params = AdditiveNoiseParameters()
+        exploration_params.noise_as_percentage_from_action_space = False
+
+        super().__init__(algorithm=WolpertingerAlgorithmParameters(),
+                         exploration=exploration_params,
+                         memory=EpisodicExperienceReplayParameters(),
+                         networks=OrderedDict(
+                             [("actor", WolpertingerActorNetworkParameters(use_batchnorm=use_batchnorm)),
+                              ("critic", WolpertingerCriticNetworkParameters(use_batchnorm=use_batchnorm))]))
+
+    @property
+    def path(self):
+        return 'rl_coach.agents.wolpertinger_agent:WolpertingerAgent'
+
+
+# Deep Reinforcement Learning in Large Discrete Action Spaces - https://arxiv.org/pdf/1512.07679.pdf
+class WolpertingerAgent(DDPGAgent):
+    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent'] = None):
+        super().__init__(agent_parameters, parent)
+
+    def learn_from_batch(self, batch):
+        # The replay buffer holds actions in their discrete form, since the agent is expected to act with discrete
+        # actions through the BoxDiscretization output filter. DDPG, however, trains on continuous actions, so the
+        # stored actions are converted back to continuous values here. This duplicates the filtering that is already
+        # done before actions are applied to the environment, so ideally that conversion would be reused, e.g. by
+        # storing it in the transition's info dictionary.
+
+        output_action_filter = \
+            list(self.output_filter.action_filters.values())[0]
+        continuous_actions = []
+        for action in batch.actions():
+            continuous_actions.append(output_action_filter.filter(action))
+        batch._actions = np.array(continuous_actions).squeeze()
+
+        return super().learn_from_batch(batch)
+
+    def train(self):
+        return super().train()
+
+    def choose_action(self, curr_state):
+        if not isinstance(self.spaces.action, DiscreteActionSpace):
+            raise ValueError("WolpertingerAgent works only for discrete control problems")
+
+        # convert to batch so we can run it through the network
+        tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
+        actor_network = self.networks['actor'].online_network
+        critic_network = self.networks['critic'].online_network
+        proto_action = actor_network.predict(tf_input_state)
+        proto_action = np.expand_dims(self.exploration_policy.get_action(proto_action), 0)
+
+        nn_action_embeddings, indices, _, _ = self.knn_tree.query(keys=proto_action, k=self.ap.algorithm.k)
+
+        # now move the actions through the critic and choose the one with the highest q value
+        critic_inputs = copy.copy(tf_input_state)
+        critic_inputs['observation'] = np.tile(critic_inputs['observation'], (self.ap.algorithm.k, 1))
+        critic_inputs['action'] = nn_action_embeddings[0]
+        q_values = critic_network.predict(critic_inputs)[0]
+        action = int(indices[0][np.argmax(q_values)])
+        self.action_signal.add_sample(action)
+        return ActionInfo(action=action, action_value=0)
+
+    def init_environment_dependent_modules(self):
+        super().init_environment_dependent_modules()
+        self.knn_tree = self.get_initialized_knn()
+
+    # TODO - ideally the knn should not be defined here, but somehow be defined by the user in the preset
+    def get_initialized_knn(self):
+        num_actions = len(self.spaces.action.actions)
+        action_max_abs_range = self.spaces.action.filtered_action_space.max_abs_range if \
+            (hasattr(self.spaces.action, 'filtered_action_space') and
+             isinstance(self.spaces.action.filtered_action_space, BoxActionSpace)) \
+            else 1.0
+        keys = np.expand_dims((np.arange(num_actions) / (num_actions - 1) - 0.5) * 2, 1) * action_max_abs_range
+        values = np.expand_dims(np.arange(num_actions), 1)
+        knn_tree = AnnoyDictionary(dict_size=num_actions, key_width=self.ap.algorithm.action_embedding_width)
+        knn_tree.add(keys, values, force_rebuild_tree=True)
+
+        return knn_tree
+
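For orientation, the selection rule implemented in choose_action above can be summarized outside of Coach as follows. This is only an illustrative numpy sketch under simplifying assumptions (a single state, stand-in actor/critic callables, a plain array instead of the Annoy index); the names wolpertinger_select, action_embeddings and critic_q are hypothetical and not part of the patch.

import numpy as np

def wolpertinger_select(proto_action, action_embeddings, critic_q, k):
    # action_embeddings: (num_actions, embedding_width) table, one row per discrete action
    # critic_q: stand-in for the critic, mapping an action embedding to a Q-value for the current state
    distances = np.linalg.norm(action_embeddings - proto_action, axis=1)
    nearest = np.argsort(distances)[:k]                      # k-NN lookup around the actor's proto-action
    q_values = [critic_q(action_embeddings[i]) for i in nearest]
    return int(nearest[int(np.argmax(q_values))])            # refine the choice with the critic

# toy usage: 11 actions embedded uniformly in [-1, 1]; the critic prefers embeddings near 0.45
embeddings = np.linspace(-1.0, 1.0, 11).reshape(-1, 1)
chosen = wolpertinger_select(np.array([0.25]), embeddings, lambda e: -abs(float(e[0]) - 0.45), k=3)
assert chosen == 7   # plain nearest-neighbour would pick action 6 (0.2); the critic promotes action 7 (0.4)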
diff --git a/rl_coach/architectures/head_parameters.py b/rl_coach/architectures/head_parameters.py
index ee607dd..207ea3e 100644
--- a/rl_coach/architectures/head_parameters.py
+++ b/rl_coach/architectures/head_parameters.py
@@ -108,6 +108,17 @@ class DDPGActorHeadParameters(HeadParameters):
         self.batchnorm = batchnorm
 
 
+class WolpertingerActorHeadParameters(HeadParameters):
+    def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True,
+                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
+                 loss_weight: float = 1.0, dense_layer=None):
+        super().__init__(parameterized_class_name="WolpertingerActorHead", activation_function=activation_function, name=name,
+                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
+                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
+                         loss_weight=loss_weight)
+        self.batchnorm = batchnorm
+
+
 class DNDQHeadParameters(HeadParameters):
     def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params',
                  num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
diff --git a/rl_coach/architectures/tensorflow_components/heads/__init__.py b/rl_coach/architectures/tensorflow_components/heads/__init__.py
index 03c237a..0a83399 100644
--- a/rl_coach/architectures/tensorflow_components/heads/__init__.py
+++ b/rl_coach/architectures/tensorflow_components/heads/__init__.py
@@ -18,6 +18,7 @@ from .classification_head import ClassificationHead
 from .cil_head import RegressionHead
 from .td3_v_head import TD3VHead
 from .ddpg_v_head import DDPGVHead
+from .wolpertinger_actor_head import WolpertingerActorHead
 
 __all__ = [
     'CategoricalQHead',
@@ -38,6 +39,7 @@ __all__ = [
     'SACQHead',
     'ClassificationHead',
     'RegressionHead',
-    'TD3VHead'
-    'DDPGVHead'
+    'TD3VHead',
+    'DDPGVHead',
+    'WolpertingerActorHead'
 ]
diff --git a/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py b/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py
new file mode 100644
index 0000000..3521a95
--- /dev/null
+++ b/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py
@@ -0,0 +1,59 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import tensorflow as tf
+
+from rl_coach.architectures.tensorflow_components.layers import batchnorm_activation_dropout, Dense
+from rl_coach.architectures.tensorflow_components.heads.head import Head
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import Embedding
+from rl_coach.spaces import SpacesDefinition, BoxActionSpace
+
+
+class WolpertingerActorHead(Head):
+    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
+                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
+                 batchnorm: bool=True, dense_layer=Dense, is_training=False):
+        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
+                         dense_layer=dense_layer, is_training=is_training)
+        self.name = 'wolpertinger_actor_head'
+        self.return_type = Embedding
+        self.action_embedding_width = agent_parameters.algorithm.action_embedding_width
+        self.batchnorm = batchnorm
+        self.output_scale = self.spaces.action.filtered_action_space.max_abs_range if \
+            (hasattr(self.spaces.action, 'filtered_action_space') and
+             isinstance(self.spaces.action.filtered_action_space, BoxActionSpace)) \
+            else None
+
+    def _build_module(self, input_layer):
+        # proto-action embedding
+        pre_activation_policy_value = self.dense_layer(self.action_embedding_width)(input_layer,
+                                                                                    name='actor_action_embedding')
+        self.proto_action = batchnorm_activation_dropout(input_layer=pre_activation_policy_value,
+                                                         batchnorm=self.batchnorm,
+                                                         activation_function=self.activation_function,
+                                                         dropout_rate=0,
+                                                         is_training=self.is_training,
+                                                         name="BatchnormActivationDropout_0")[-1]
+        if self.output_scale is not None:
+            self.proto_action = tf.multiply(self.proto_action, self.output_scale, name='proto_action')
+
+        self.output = [self.proto_action]
+
+    def __str__(self):
+        result = [
+            'Dense (num outputs = {})'.format(self.action_embedding_width)
+        ]
+        return '\n'.join(result)
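As a rough picture of what the head above produces, here is a numpy-only sketch of the proto-action computation: a dense projection to the action-embedding width, followed by a tanh activation (the batchnorm/dropout step is omitted) and an optional stretch to the range of the underlying BoxActionSpace. The function and argument names are hypothetical and only for illustration.

import numpy as np

def proto_action_sketch(middleware_output, weights, bias, output_scale=None):
    # middleware_output: (batch, features); weights: (features, embedding_width); bias: (embedding_width,)
    pre_activation = middleware_output @ weights + bias      # dense 'actor_action_embedding' layer
    embedding = np.tanh(pre_activation)                      # activation_function='tanh'
    if output_scale is not None:
        embedding = embedding * output_scale                 # scale to the filtered BoxActionSpace range
    return embedding

# toy usage: batch of 2 states, 3 middleware features, 1-D action embedding scaled to [-2, 2]
rng = np.random.default_rng(0)
out = proto_action_sketch(rng.normal(size=(2, 3)), rng.normal(size=(3, 1)), np.zeros(1), output_scale=2.0)
assert out.shape == (2, 1) and np.all(np.abs(out) <= 2.0)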
diff --git a/rl_coach/exploration_policies/additive_noise.py b/rl_coach/exploration_policies/additive_noise.py
index 8194718..8b67c7d 100644
--- a/rl_coach/exploration_policies/additive_noise.py
+++ b/rl_coach/exploration_policies/additive_noise.py
@@ -62,7 +62,9 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
         self.evaluation_noise = evaluation_noise
         self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space
 
-        if not isinstance(action_space, BoxActionSpace):
+        if not isinstance(action_space, BoxActionSpace) and \
+                (hasattr(action_space, 'filtered_action_space') and not
+                 isinstance(action_space.filtered_action_space, BoxActionSpace)):
             raise ValueError("Additive noise exploration works only for continuous controls."
                              "The given action space is of type: {}".format(action_space.__class__.__name__))
 
diff --git a/rl_coach/exploration_policies/exploration_policy.py b/rl_coach/exploration_policies/exploration_policy.py
index a345895..688fcce 100644
--- a/rl_coach/exploration_policies/exploration_policy.py
+++ b/rl_coach/exploration_policies/exploration_policy.py
@@ -115,5 +115,8 @@ class ContinuousActionExplorationPolicy(ExplorationPolicy):
         """
         :param action_space: the action space used by the environment
         """
-        assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)
+        assert isinstance(action_space, BoxActionSpace) or \
+               (hasattr(action_space, 'filtered_action_space') and
+                isinstance(action_space.filtered_action_space, BoxActionSpace)) or \
+               isinstance(action_space, GoalsSpace)
         super().__init__(action_space)
diff --git a/rl_coach/filters/action/partial_discrete_action_space_map.py b/rl_coach/filters/action/partial_discrete_action_space_map.py
index 2322698..ad6e105 100644
--- a/rl_coach/filters/action/partial_discrete_action_space_map.py
+++ b/rl_coach/filters/action/partial_discrete_action_space_map.py
@@ -48,7 +48,8 @@ class PartialDiscreteActionSpaceMap(ActionFilter):
 
     def get_unfiltered_action_space(self, output_action_space: ActionSpace) -> DiscreteActionSpace:
         self.output_action_space = output_action_space
-        self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions)
+        self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions,
+                                                      filtered_action_space=output_action_space)
         return self.input_action_space
 
     def filter(self, action: ActionType) -> ActionType:
diff --git a/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py b/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
index 3368ee8..8633118 100644
--- a/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
+++ b/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
@@ -57,7 +57,7 @@ class AnnoyDictionary(object):
 
         self.built_capacity = 0
 
-    def add(self, keys, values, additional_data=None):
+    def add(self, keys, values, additional_data=None, force_rebuild_tree=False):
         if not additional_data:
             additional_data = [None] * len(keys)
 
@@ -96,7 +96,7 @@ class AnnoyDictionary(object):
         if len(self.buffered_indices) >= self.min_update_size:
             self.min_update_size = max(self.initial_update_size, int(self.curr_size * 0.02))
             self._rebuild_index()
-        elif self.rebuild_on_every_update:
+        elif force_rebuild_tree or self.rebuild_on_every_update:
             self._rebuild_index()
 
         self.current_timestamp += 1
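The two small AnnoyDictionary changes above let the agent build its action-embedding index once, up front, and query it immediately. A usage sketch mirroring get_initialized_knn from the new agent (the 11-action setup and the 0.07 query value are arbitrary illustration choices):

import numpy as np
from rl_coach.memories.non_episodic.differentiable_neural_dictionary import AnnoyDictionary

num_actions, embedding_width = 11, 1
keys = np.expand_dims((np.arange(num_actions) / (num_actions - 1) - 0.5) * 2, 1)   # embeddings spread over [-1, 1]
values = np.expand_dims(np.arange(num_actions), 1)                                  # the discrete action indices

knn = AnnoyDictionary(dict_size=num_actions, key_width=embedding_width)
knn.add(keys, values, force_rebuild_tree=True)        # force_rebuild_tree makes the index queryable right away

# the agent queries it with the actor's proto-action to get the k nearest discrete actions
nearest_embeddings, indices, _, _ = knn.query(keys=np.array([[0.07]]), k=3)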
diff --git a/rl_coach/presets/Mujoco_Wolpertinger.py b/rl_coach/presets/Mujoco_Wolpertinger.py
new file mode 100644
index 0000000..f12e41c
--- /dev/null
+++ b/rl_coach/presets/Mujoco_Wolpertinger.py
@@ -0,0 +1,57 @@
+from collections import OrderedDict
+
+from rl_coach.architectures.layers import Dense
+from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
+from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
+from rl_coach.environments.environment import SingleLevelSelection
+from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
+from rl_coach.filters.action import BoxDiscretization
+from rl_coach.filters.filter import OutputFilter
+from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
+from rl_coach.graph_managers.graph_manager import ScheduleParameters
+from rl_coach.agents.wolpertinger_agent import WolpertingerAgentParameters
+
+####################
+# Graph Scheduling #
+####################
+schedule_params = ScheduleParameters()
+schedule_params.improve_steps = EnvironmentSteps(2000000)
+schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
+schedule_params.evaluation_steps = EnvironmentEpisodes(1)
+schedule_params.heatup_steps = EnvironmentSteps(3000)
+
+#########
+# Agent #
+#########
+agent_params = WolpertingerAgentParameters()
+agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
+agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
+agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(400)]
+agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
+agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
+agent_params.output_filter = \
+    OutputFilter(
+        action_filters=OrderedDict([
+            ('discretization', BoxDiscretization(num_bins_per_dimension=int(1e6)))
+        ]),
+        is_a_reference_filter=False
+    )
+
+###############
+# Environment #
+###############
+env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
+
+########
+# Test #
+########
+preset_validation_params = PresetValidationParameters()
+preset_validation_params.test = True
+preset_validation_params.min_reward_threshold = 500
+preset_validation_params.max_episodes_to_achieve_reward = 1000
+preset_validation_params.reward_test_level = 'inverted_pendulum'
+preset_validation_params.trace_test_levels = ['inverted_pendulum']
+
+graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
+                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
+                                    preset_validation_params=preset_validation_params)
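The preset's only non-standard piece is the output filter: BoxDiscretization(num_bins_per_dimension=int(1e6)) turns the MuJoCo Box action space into a very large discrete space that the agent indexes into. An illustrative numpy sketch of that kind of mapping (not Coach's exact implementation; the helper name is made up):

import numpy as np

def discrete_to_continuous(action_index, low=-1.0, high=1.0, num_bins=1_000_000):
    # an evenly spaced grid over the continuous range; the discrete action is just an index into it
    grid = np.linspace(low, high, num_bins)
    return float(grid[action_index])

# with 1e6 bins per dimension the grid is fine enough that the k-NN + critic refinement
# effectively searches a near-continuous action set
assert abs(discrete_to_continuous(500_000)) < 1e-5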
diff --git a/rl_coach/spaces.py b/rl_coach/spaces.py
index 503598c..5dcaa2b 100644
--- a/rl_coach/spaces.py
+++ b/rl_coach/spaces.py
@@ -385,7 +385,8 @@ class DiscreteActionSpace(ActionSpace):
     """
     A discrete action space with action indices as actions
     """
-    def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None):
+    def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None,
+                 filtered_action_space=None):
         super().__init__(1, low=0, high=num_actions-1, descriptions=descriptions)
 
         # the number of actions is mapped to high
@@ -395,6 +396,9 @@ class DiscreteActionSpace(ActionSpace):
         else:
             self.default_action = default_action
 
+        if filtered_action_space is not None:
+            self.filtered_action_space = filtered_action_space
+
     @property
     def actions(self) -> List[ActionType]:
         return list(range(0, int(self.high[0]) + 1))
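Taken together with the PartialDiscreteActionSpaceMap change above, this lets a discretized action space remember the continuous space it was derived from, which is what the relaxed exploration-policy checks look for. A minimal sketch, assuming BoxActionSpace accepts a shape plus low/high bounds (its exact constructor arguments are not shown in this patch):

from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace

continuous_space = BoxActionSpace(1, low=-2.0, high=2.0)          # the environment's real action space
discrete_space = DiscreteActionSpace(num_actions=1000,
                                     filtered_action_space=continuous_space)

# DDPG-style exploration policies can now accept this "discrete" space,
# because they fall back to the underlying BoxActionSpace
assert isinstance(discrete_space.filtered_action_space, BoxActionSpace)
assert discrete_space.actions[:3] == [0, 1, 2]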