diff --git a/docs/_modules/rl_coach/agents/agent.html b/docs/_modules/rl_coach/agents/agent.html
index 49d4a8a..0f566aa 100644
--- a/docs/_modules/rl_coach/agents/agent.html
+++ b/docs/_modules/rl_coach/agents/agent.html
@@ -756,6 +756,9 @@
ifself.phase!=RunPhase.TEST:ifisinstance(self.memory,EpisodicExperienceReplay):
+ ifself.ap.algorithm.override_episode_rewards_with_the_last_transition_reward:
+ fortinself.current_episode_buffer.transitions:
+ t.reward=self.current_episode_buffer.transitions[-1].rewardself.call_memory('store_episode',self.current_episode_buffer)elifself.ap.algorithm.store_transitions_only_when_episodes_are_terminated:fortransitioninself.current_episode_buffer.transitions:
@@ -910,7 +913,8 @@
# update countersself.training_iteration+=1ifself.pre_network_filterisnotNone:
- batch=self.pre_network_filter.filter(batch,update_internal_state=False,deep_copy=False)
+ update_internal_state=self.ap.algorithm.update_pre_network_filters_state_on_train
+ batch=self.pre_network_filter.filter(batch,update_internal_state=update_internal_state,deep_copy=False)# if the batch returned empty then there are not enough samples in the replay buffer -> skip# training step
@@ -1020,7 +1024,8 @@
# informed actionifself.pre_network_filterisnotNone:# before choosing an action, first use the pre_network_filter to filter out the current state
- update_filter_internal_state=self.phaseisnotRunPhase.TEST
+ update_filter_internal_state=self.ap.algorithm.update_pre_network_filters_state_on_inferenceand \
+ self.phaseisnotRunPhase.TESTcurr_state=self.run_pre_network_filter_for_inference(self.curr_state,update_filter_internal_state)else:
@@ -1048,6 +1053,10 @@
:return: The filtered state """dummy_env_response=EnvResponse(next_state=state,reward=0,game_over=False)
+
+ # TODO actually we only want to run the observation filters. No point in running the reward filters as the
+ # filtered reward is being ignored anyway (and it might unnecessarily affect the reward filters' internal
+ # state).returnself.pre_network_filter.filter(dummy_env_response,update_internal_state=update_filter_internal_state)[0].next_state
@@ -1177,7 +1186,7 @@
""" Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent has another master agent that is controlling it. In such cases, the master agent can define the goals for the
- slave agent, define it's observation, possible actions, etc. The directive type is defined by the agent
+ slave agent, define its observation, possible actions, etc. The directive type is defined by the agent in-action-space. :param action: The action that should be set as the directive
diff --git a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
index 9a3f3b1..ba808c2 100644
--- a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
+++ b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
@@ -295,7 +295,9 @@
self.optimization_epochs=10self.normalization_stats=Noneself.clipping_decay_schedule=ConstantSchedule(1)
- self.act_for_full_episodes=True
+ self.act_for_full_episodes=True
+ self.update_pre_network_filters_state_on_train=True
+ self.update_pre_network_filters_state_on_inference=FalseclassClippedPPOAgentParameters(AgentParameters):
@@ -486,7 +488,9 @@
network.set_is_training(True)dataset=self.memory.transitions
- dataset=self.pre_network_filter.filter(dataset,deep_copy=False)
+ update_internal_state=self.ap.algorithm.update_pre_network_filters_state_on_train
+ dataset=self.pre_network_filter.filter(dataset,deep_copy=False,
+ update_internal_state=update_internal_state)batch=Batch(dataset)fortraining_stepinrange(self.ap.algorithm.num_consecutive_training_steps):
@@ -512,7 +516,9 @@
defrun_pre_network_filter_for_inference(self,state:StateType,update_internal_state:bool=False):dummy_env_response=EnvResponse(next_state=state,reward=0,game_over=False)
- returnself.pre_network_filter.filter(dummy_env_response,update_internal_state=False)[0].next_state
+ update_internal_state=self.ap.algorithm.update_pre_network_filters_state_on_inference
+ returnself.pre_network_filter.filter(
+ dummy_env_response,update_internal_state=update_internal_state)[0].next_statedefchoose_action(self,curr_state):self.ap.algorithm.clipping_decay_schedule.step()
diff --git a/docs/_modules/rl_coach/agents/wolpertinger_agent.html b/docs/_modules/rl_coach/agents/wolpertinger_agent.html
new file mode 100644
index 0000000..67dd6cd
--- /dev/null
+++ b/docs/_modules/rl_coach/agents/wolpertinger_agent.html
@@ -0,0 +1,356 @@
+
+
+
+
+
+
+
+
+
+
+ rl_coach.agents.wolpertinger_agent — Reinforcement Learning Coach 0.12.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Source code for rl_coach.agents.wolpertinger_agent
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+importcopy
+fromtypingimportUnion
+fromcollectionsimportOrderedDict
+importnumpyasnp
+
+fromrl_coach.agents.ddpg_agentimportDDPGAlgorithmParameters,DDPGActorNetworkParameters, \
+ DDPGCriticNetworkParameters,DDPGAgent
+fromrl_coach.base_parametersimportAgentParameters
+fromrl_coach.core_typesimportActionInfo
+fromrl_coach.exploration_policies.additive_noiseimportAdditiveNoiseParameters
+fromrl_coach.memories.episodic.episodic_experience_replayimportEpisodicExperienceReplayParameters
+fromrl_coach.memories.non_episodic.differentiable_neural_dictionaryimportAnnoyDictionary
+fromrl_coach.spacesimportDiscreteActionSpace,BoxActionSpace
+fromrl_coach.architectures.head_parametersimportWolpertingerActorHeadParameters
+
+
+classWolpertingerCriticNetworkParameters(DDPGCriticNetworkParameters):
+ def__init__(self,use_batchnorm=False):
+ super().__init__(use_batchnorm=use_batchnorm)
+
+
+classWolpertingerActorNetworkParameters(DDPGActorNetworkParameters):
+ def__init__(self,use_batchnorm=False):
+ super().__init__()
+ self.heads_parameters=[WolpertingerActorHeadParameters(batchnorm=use_batchnorm)]
+
+
+
+
+
+classWolpertingerAgentParameters(AgentParameters):
+ def__init__(self,use_batchnorm=False):
+ exploration_params=AdditiveNoiseParameters()
+ exploration_params.noise_as_percentage_from_action_space=False
+
+ super().__init__(algorithm=WolpertingerAlgorithmParameters(),
+ exploration=exploration_params,
+ memory=EpisodicExperienceReplayParameters(),
+ networks=OrderedDict(
+ [("actor",WolpertingerActorNetworkParameters(use_batchnorm=use_batchnorm)),
+ ("critic",WolpertingerCriticNetworkParameters(use_batchnorm=use_batchnorm))]))
+
+ @property
+ defpath(self):
+ return'rl_coach.agents.wolpertinger_agent:WolpertingerAgent'
+
+
+# Deep Reinforcement Learning in Large Discrete Action Spaces - https://arxiv.org/pdf/1512.07679.pdf
+classWolpertingerAgent(DDPGAgent):
+ def__init__(self,agent_parameters,parent:Union['LevelManager','CompositeAgent']=None):
+ super().__init__(agent_parameters,parent)
+
+ deflearn_from_batch(self,batch):
+ # replay buffer holds the actions in the discrete manner, as the agent is expected to act with discrete actions
+ # with the BoxDiscretization output filter. But DDPG needs to work on continuous actions, thus converting to
+ # continuous actions. This is actually a duplicate since this filtering is also done before applying actions on
+ # the environment. So might want to somehow reuse that conversion. Maybe can hold this information in the info
+ # dictionary of the transition.
+
+ output_action_filter= \
+ list(self.output_filter.action_filters.values())[0]
+ continuous_actions=[]
+ foractioninbatch.actions():
+ continuous_actions.append(output_action_filter.filter(action))
+ batch._actions=np.array(continuous_actions).squeeze()
+
+ returnsuper().learn_from_batch(batch)
+
+ deftrain(self):
+ returnsuper().train()
+
+ defchoose_action(self,curr_state):
+ ifnotisinstance(self.spaces.action,DiscreteActionSpace):
+ raiseValueError("WolpertingerAgent works only for discrete control problems")
+
+ # convert to batch so we can run it through the network
+ tf_input_state=self.prepare_batch_for_inference(curr_state,'actor')
+ actor_network=self.networks['actor'].online_network
+ critic_network=self.networks['critic'].online_network
+ proto_action=actor_network.predict(tf_input_state)
+ proto_action=np.expand_dims(self.exploration_policy.get_action(proto_action),0)
+
+ nn_action_embeddings,indices,_,_=self.knn_tree.query(keys=proto_action,k=self.ap.algorithm.k)
+
+ # now move the actions through the critic and choose the one with the highest q value
+ critic_inputs=copy.copy(tf_input_state)
+ critic_inputs['observation']=np.tile(critic_inputs['observation'],(self.ap.algorithm.k,1))
+ critic_inputs['action']=nn_action_embeddings[0]
+ q_values=critic_network.predict(critic_inputs)[0]
+ action=int(indices[0][np.argmax(q_values)])
+ self.action_signal.add_sample(action)
+ returnActionInfo(action=action,action_value=0)
+
+ definit_environment_dependent_modules(self):
+ super().init_environment_dependent_modules()
+ self.knn_tree=self.get_initialized_knn()
+
+ # TODO - ideally the knn should not be defined here, but somehow be defined by the user in the preset
+ defget_initialized_knn(self):
+ num_actions=len(self.spaces.action.actions)
+ action_max_abs_range=self.spaces.action.filtered_action_space.max_abs_rangeif \
+ (hasattr(self.spaces.action,'filtered_action_space')and
+ isinstance(self.spaces.action.filtered_action_space,BoxActionSpace)) \
+ else1.0
+ keys=np.expand_dims((np.arange(num_actions)/(num_actions-1)-0.5)*2,1)*action_max_abs_range
+ values=np.expand_dims(np.arange(num_actions),1)
+ knn_tree=AnnoyDictionary(dict_size=num_actions,key_width=self.ap.algorithm.action_embedding_width)
+ knn_tree.add(keys,values,force_rebuild_tree=True)
+
+ returnknn_tree
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/_modules/rl_coach/base_parameters.html b/docs/_modules/rl_coach/base_parameters.html
index 60aac7f..045363d 100644
--- a/docs/_modules/rl_coach/base_parameters.html
+++ b/docs/_modules/rl_coach/base_parameters.html
@@ -396,6 +396,14 @@
# Support for parameter noiseself.supports_parameter_noise=False
+ # Override, in retrospect, all the episode rewards with the last reward in the episode
+ # (sometimes useful for sparse, end of the episode, rewards problems)
+ self.override_episode_rewards_with_the_last_transition_reward=False
+
+ # Filters - TODO consider creating a FilterParameters class and initialize the filters with it
+ self.update_pre_network_filters_state_on_train=False
+ self.update_pre_network_filters_state_on_inference=True
+
[docs]classNFSDataStore(CheckpointDataStore):""" An implementation of data store which uses NFS for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker.
diff --git a/docs/_modules/rl_coach/data_stores/s3_data_store.html b/docs/_modules/rl_coach/data_stores/s3_data_store.html
index 64fc0b9..c2e4b9b 100644
--- a/docs/_modules/rl_coach/data_stores/s3_data_store.html
+++ b/docs/_modules/rl_coach/data_stores/s3_data_store.html
@@ -198,7 +198,8 @@
#
-fromrl_coach.data_stores.data_storeimportDataStore,DataStoreParameters
+fromrl_coach.data_stores.data_storeimportDataStoreParameters
+fromrl_coach.data_stores.checkpoint_data_storeimportCheckpointDataStorefromminioimportMiniofromminio.errorimportResponseErrorfromconfigparserimportConfigParser,Error
@@ -222,7 +223,7 @@
self.expt_dir=expt_dir
-
[docs]classS3DataStore(CheckpointDataStore):""" An implementation of the data store using S3 for storing policy checkpoints when using Coach in distributed mode. The policy checkpoints are written by the trainer and read by the rollout worker.
diff --git a/docs/_modules/rl_coach/exploration_policies/additive_noise.html b/docs/_modules/rl_coach/exploration_policies/additive_noise.html
index 44e2dc3..92eb352 100644
--- a/docs/_modules/rl_coach/exploration_policies/additive_noise.html
+++ b/docs/_modules/rl_coach/exploration_policies/additive_noise.html
@@ -245,7 +245,9 @@
self.evaluation_noise=evaluation_noiseself.noise_as_percentage_from_action_space=noise_as_percentage_from_action_space
- ifnotisinstance(action_space,BoxActionSpace):
+ ifnotisinstance(action_space,BoxActionSpace)and \
+ (hasattr(action_space,'filtered_action_space')andnot
+ isinstance(action_space.filtered_action_space,BoxActionSpace)):raiseValueError("Additive noise exploration works only for continuous controls.""The given action space is of type: {}".format(action_space.__class__.__name__))
diff --git a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html
index f5ffd7d..faa4583 100644
--- a/docs/_modules/rl_coach/exploration_policies/exploration_policy.html
+++ b/docs/_modules/rl_coach/exploration_policies/exploration_policy.html
@@ -298,7 +298,10 @@
""" :param action_space: the action space used by the environment """
- assertisinstance(action_space,BoxActionSpace)orisinstance(action_space,GoalsSpace)
+ assertisinstance(action_space,BoxActionSpace)or \
+ (hasattr(action_space,'filtered_action_space')and
+ isinstance(action_space.filtered_action_space,BoxActionSpace))or \
+ isinstance(action_space,GoalsSpace)super().__init__(action_space)
diff --git a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html
index 7d33198..56d84e7 100644
--- a/docs/_modules/rl_coach/exploration_policies/truncated_normal.html
+++ b/docs/_modules/rl_coach/exploration_policies/truncated_normal.html
@@ -271,9 +271,6 @@
else:action_values_std=current_noise
- # scale the noise to the action space range
- action_values_std=current_noise*(self.action_space.high-self.action_space.low)
-
# extract the mean valuesifisinstance(action_values,list):# the action values are expected to be a list with the action mean and optionally the action stdev
diff --git a/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html b/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html
index 0fae9ce..4e4837b 100644
--- a/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html
+++ b/docs/_modules/rl_coach/filters/action/partial_discrete_action_space_map.html
@@ -231,7 +231,8 @@
defget_unfiltered_action_space(self,output_action_space:ActionSpace)->DiscreteActionSpace:self.output_action_space=output_action_space
- self.input_action_space=DiscreteActionSpace(len(self.target_actions),self.descriptions)
+ self.input_action_space=DiscreteActionSpace(len(self.target_actions),self.descriptions,
+ filtered_action_space=output_action_space)returnself.input_action_spacedeffilter(self,action:ActionType)->ActionType:
diff --git a/docs/_modules/rl_coach/memories/backend/redis.html b/docs/_modules/rl_coach/memories/backend/redis.html
index c904f4d..7567160 100644
--- a/docs/_modules/rl_coach/memories/backend/redis.html
+++ b/docs/_modules/rl_coach/memories/backend/redis.html
@@ -261,11 +261,18 @@
"""if'namespace'notinself.params.orchestrator_params:self.params.orchestrator_params['namespace']="default"
- fromkubernetesimportclient
+ fromkubernetesimportclient,configcontainer=client.V1Container(name=self.redis_server_name,image='redis:4-alpine',
+ resources=client.V1ResourceRequirements(
+ limits={
+ "cpu":"8",
+ "memory":"4Gi"
+ # "nvidia.com/gpu": "0",
+ }
+ ),)template=client.V1PodTemplateSpec(metadata=client.V1ObjectMeta(labels={'app':self.redis_server_name}),
@@ -288,8 +295,10 @@
spec=deployment_spec)
+ config.load_kube_config()api_client=client.AppsV1Api()try:
+ print(self.params.orchestrator_params)api_client.create_namespaced_deployment(self.params.orchestrator_params['namespace'],deployment)exceptclient.rest.ApiExceptionase:print("Got exception: %s\n while creating redis-server",e)
diff --git a/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html b/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html
index 2a5d1f9..063e0e4 100644
--- a/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html
+++ b/docs/_modules/rl_coach/memories/non_episodic/differentiable_neural_dictionary.html
@@ -240,7 +240,7 @@
self.built_capacity=0
- defadd(self,keys,values,additional_data=None):
+ defadd(self,keys,values,additional_data=None,force_rebuild_tree=False):ifnotadditional_data:additional_data=[None]*len(keys)
@@ -279,7 +279,7 @@
iflen(self.buffered_indices)>=self.min_update_size:self.min_update_size=max(self.initial_update_size,int(self.curr_size*0.02))self._rebuild_index()
- elifself.rebuild_on_every_update:
+ elifforce_rebuild_treeorself.rebuild_on_every_update:self._rebuild_index()self.current_timestamp+=1
diff --git a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
index b8c99db..71d144f 100644
--- a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
+++ b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
@@ -307,6 +307,11 @@
"""self.memory_backend.deploy()
+
+ ifself.params.data_store_params.store_type=="redis":
+ self.data_store.params.redis_address=self.memory_backend.params.redis_address
+ self.data_store.params.redis_port=self.memory_backend.params.redis_port
+
ifnotself.data_store.deploy():returnFalseifself.params.data_store_params.store_type=="nfs":
@@ -329,6 +334,8 @@
trainer_params.command+=['--data_store_params',json.dumps(self.params.data_store_params.__dict__)]name="{}-{}".format(trainer_params.run_type,uuid.uuid4())
+ # TODO: instead of defining each container and template spec from scratch, load default
+ # configuration and modify them as necessary depending on the store typeifself.params.data_store_params.store_type=="nfs":container=k8sclient.V1Container(name=name,
@@ -354,7 +361,7 @@
restart_policy='Never'),)
- else:
+ elifself.params.data_store_params.store_type=="s3":container=k8sclient.V1Container(name=name,image=trainer_params.image,
@@ -373,6 +380,34 @@
restart_policy='Never'),)
+ elifself.params.data_store_params.store_type=="redis":
+ container=k8sclient.V1Container(
+ name=name,
+ image=trainer_params.image,
+ command=trainer_params.command,
+ args=trainer_params.arguments,
+ image_pull_policy='Always',
+ stdin=True,
+ tty=True,
+ resources=k8sclient.V1ResourceRequirements(
+ limits={
+ "cpu":"40",
+ "memory":"4Gi",
+ "nvidia.com/gpu":"1",
+ }
+ ),
+ )
+ template=k8sclient.V1PodTemplateSpec(
+ metadata=k8sclient.V1ObjectMeta(labels={'app':name}),
+ spec=k8sclient.V1PodSpec(
+ containers=[container],
+ restart_policy='Never'
+ ),
+ )
+ else:
+ raiseValueError("unexpected store_type {}. expected 's3', 'nfs', 'redis'".format(
+ self.params.data_store_params.store_type
+ ))job_spec=k8sclient.V1JobSpec(completions=1,
@@ -404,12 +439,17 @@
ifnotworker_params:returnFalse
+ # At this point, the memory backend and data store have been deployed and in the process,
+ # these parameters have been updated to include things like the hostname and port the
+ # service can be found at.worker_params.command+=['--memory_backend_params',json.dumps(self.params.memory_backend_parameters.__dict__)]worker_params.command+=['--data_store_params',json.dumps(self.params.data_store_params.__dict__)]worker_params.command+=['--num_workers','{}'.format(worker_params.num_replicas)]name="{}-{}".format(worker_params.run_type,uuid.uuid4())
+ # TODO: instead of defining each container and template spec from scratch, load default
+ # configuration and modify them as necessary depending on the store typeifself.params.data_store_params.store_type=="nfs":container=k8sclient.V1Container(name=name,
@@ -435,7 +475,7 @@
restart_policy='Never'),)
- else:
+ elifself.params.data_store_params.store_type=="s3":container=k8sclient.V1Container(name=name,image=worker_params.image,
@@ -454,6 +494,32 @@
restart_policy='Never'))
+ elifself.params.data_store_params.store_type=="redis":
+ container=k8sclient.V1Container(
+ name=name,
+ image=worker_params.image,
+ command=worker_params.command,
+ args=worker_params.arguments,
+ image_pull_policy='Always',
+ stdin=True,
+ tty=True,
+ resources=k8sclient.V1ResourceRequirements(
+ limits={
+ "cpu":"8",
+ "memory":"4Gi",
+ # "nvidia.com/gpu": "0",
+ }
+ ),
+ )
+ template=k8sclient.V1PodTemplateSpec(
+ metadata=k8sclient.V1ObjectMeta(labels={'app':name}),
+ spec=k8sclient.V1PodSpec(
+ containers=[container],
+ restart_policy='Never'
+ )
+ )
+ else:
+ raiseValueError('unexpected store type {}'.format(self.params.data_store_params.store_type))job_spec=k8sclient.V1JobSpec(completions=worker_params.num_replicas,
diff --git a/docs/_modules/rl_coach/spaces.html b/docs/_modules/rl_coach/spaces.html
index 2e890d9..30472c3 100644
--- a/docs/_modules/rl_coach/spaces.html
+++ b/docs/_modules/rl_coach/spaces.html
@@ -568,7 +568,8 @@
""" A discrete action space with action indices as actions """
- def__init__(self,num_actions:int,descriptions:Union[None,List,Dict]=None,default_action:np.ndarray=None):
+ def__init__(self,num_actions:int,descriptions:Union[None,List,Dict]=None,default_action:np.ndarray=None,
+ filtered_action_space=None):super().__init__(1,low=0,high=num_actions-1,descriptions=descriptions)# the number of actions is mapped to high
@@ -578,6 +579,9 @@
else:self.default_action=default_action
+ iffiltered_action_spaceisnotNone:
+ self.filtered_action_space=filtered_action_space
+
@propertydefactions(self)->List[ActionType]:returnlist(range(0,int(self.high[0])+1))
diff --git a/docs/_sources/components/agents/index.rst.txt b/docs/_sources/components/agents/index.rst.txt
index ca21713..c958768 100644
--- a/docs/_sources/components/agents/index.rst.txt
+++ b/docs/_sources/components/agents/index.rst.txt
@@ -21,8 +21,6 @@ A detailed description of those algorithms can be found by navigating to each of
imitation/cil
policy_optimization/cppo
policy_optimization/ddpg
- policy_optimization/td3
- policy_optimization/sac
other/dfp
value_optimization/double_dqn
value_optimization/dqn
@@ -36,6 +34,10 @@ A detailed description of those algorithms can be found by navigating to each of
policy_optimization/ppo
value_optimization/rainbow
value_optimization/qr_dqn
+ policy_optimization/sac
+ policy_optimization/td3
+ policy_optimization/wolpertinger
+
.. autoclass:: rl_coach.base_parameters.AgentParameters
diff --git a/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt b/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt
new file mode 100644
index 0000000..5aa57d2
--- /dev/null
+++ b/docs/_sources/components/agents/policy_optimization/wolpertinger.rst.txt
@@ -0,0 +1,56 @@
+Wolpertinger
+=============
+
+**Actions space:** Discrete
+
+**References:** `Deep Reinforcement Learning in Large Discrete Action Spaces <https://arxiv.org/pdf/1512.07679.pdf>`_
+
+Network Structure
+-----------------
+
+.. image:: /_static/img/design_imgs/wolpertinger.png
+ :align: center
+
+Algorithm Description
+---------------------
+Choosing an action
+++++++++++++++++++
+
+Pass the current states through the actor network, and get a proto action :math:`\mu`.
+While in the training phase, use a continuous exploration policy, such as Gaussian noise,
+to add exploration noise to the proto action. Then, pass the proto action to a k-NN tree to find actual valid
+action candidates, which are in the surrounding neighborhood of the proto action. Those actions are then passed to the
+critic to evaluate their goodness, and eventually the discrete index of the action with the highest Q value is chosen.
+When testing, the same flow is used, but no exploration noise is added.
+
+Training the network
+++++++++++++++++++++
+
+Training the network is exactly the same as in DDPG. Unlike when choosing the action, the proto action is not passed
+through the k-NN tree. It is being passed directly to the critic.
+
+Start by sampling a batch of transitions from the experience replay.
+
+* To train the **critic network**, use the following targets:
+
+ :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))`
+
+ First run the actor target network, using the next states as the inputs, and get :math:`\mu (s_{t+1} )`.
+ Next, run the critic target network using the next states and :math:`\mu (s_{t+1} )`, and use the output to
+ calculate :math:`y_t` according to the equation above. To train the network, use the current states and actions
+ as the inputs, and :math:`y_t` as the targets.
+
+* To train the **actor network**, use the following equation:
+
+ :math:`\nabla_{\theta^\mu } J \approx E_{s_t \sim \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]`
+
+ Use the actor's online network to get the action mean values using the current states as the inputs.
+ Then, use the critic online network in order to get the gradients of the critic output with respect to the
+ action mean values :math:`\nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) }`.
+ Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights,
+ given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network.
+
+After every training step, do a soft update of the critic and actor target networks' weights from the online networks.
+
+
+.. autoclass:: rl_coach.agents.wolpertinger_agent.WolpertingerAlgorithmParameters
\ No newline at end of file
diff --git a/docs/components/agents/index.html b/docs/components/agents/index.html
index 357caad..7c90e3f 100644
--- a/docs/components/agents/index.html
+++ b/docs/components/agents/index.html
@@ -117,8 +117,6 @@
Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
observations together, measurements together, etc.
@@ -652,7 +654,7 @@ dependent on those values, by calling init_environment_dependent_modules
set_incoming_directive(action: Union[int, float, numpy.ndarray, List]) → None[source]¶
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
-slave agent, define it’s observation, possible actions, etc. The directive type is defined by the agent
+slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.
Pass the current states through the actor network, and get a proto action \(\mu\).
+While in the training phase, use a continuous exploration policy, such as Gaussian noise,
+to add exploration noise to the proto action. Then, pass the proto action to a k-NN tree to find actual valid
+action candidates, which are in the surrounding neighborhood of the proto action. Those actions are then passed to the
+critic to evaluate their goodness, and eventually the discrete index of the action with the highest Q value is chosen.
+When testing, the same flow is used, but no exploration noise is added.
Training the network is exactly the same as in DDPG. Unlike when choosing the action, the proto action is not passed
+through the k-NN tree. It is being passed directly to the critic.
+
Start by sampling a batch of transitions from the experience replay.
+
+
To train the critic network, use the following targets:
First run the actor target network, using the next states as the inputs, and get \(\mu (s_{t+1} )\).
+Next, run the critic target network using the next states and \(\mu (s_{t+1} )\), and use the output to
+calculate \(y_t\) according to the equation above. To train the network, use the current states and actions
+as the inputs, and \(y_t\) as the targets.
+
+
To train the actor network, use the following equation:
Use the actor’s online network to get the action mean values using the current states as the inputs.
+Then, use the critic online network in order to get the gradients of the critic output with respect to the
+action mean values \(\nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) }\).
+Using the chain rule, calculate the gradients of the actor’s output, with respect to the actor weights,
+given \(\nabla_a Q(s,a)\). Finally, apply those gradients to the actor network.
+
+
+
After every training step, do a soft update of the critic and actor target networks’ weights from the online networks.
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/components/spaces.html b/docs/components/spaces.html
index a62653c..753e334 100644
--- a/docs/components/spaces.html
+++ b/docs/components/spaces.html
@@ -442,7 +442,7 @@ The actions will be in the form:
Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
observations together, measurements together, etc.
diff --git a/docs_raw/source/_static/img/algorithms.png b/docs_raw/source/_static/img/algorithms.png
index 6c00f21..0849ad7 100644
Binary files a/docs_raw/source/_static/img/algorithms.png and b/docs_raw/source/_static/img/algorithms.png differ
diff --git a/docs_raw/source/_static/img/design_imgs/wolpertinger.png b/docs_raw/source/_static/img/design_imgs/wolpertinger.png
new file mode 100644
index 0000000..e7f9b37
Binary files /dev/null and b/docs_raw/source/_static/img/design_imgs/wolpertinger.png differ
diff --git a/docs_raw/source/algorithms.xml b/docs_raw/source/algorithms.xml
index e6f68c9..377709c 100644
--- a/docs_raw/source/algorithms.xml
+++ b/docs_raw/source/algorithms.xml
@@ -1 +1 @@
-7V1bk5s2FP41O9M+JIMQ18e9NduZbCdp0jR5ysgg26SAXMC73v76CnMVEphdkLGzbB5sDhJgne/7dCQdkQt4HezeRWizvicu9i9Uxd1dwJsLVQVAt+hHannKLIZuZ4ZV5Ll5ocrwyfsP50Ylt249F8dMwYQQP/E2rNEhYYidhLGhKCKPbLEl8dm7btAKc4ZPDvJ569+em6wzq6Urlf0Oe6t1cWeg5GcCVBTODfEaueSxZoK3F/A6IiTJvgW7a+ynjVe0S1bvt5az5YNFOEz6VPh+fW+gL8GXH+p7HP5156+j3dc3mp4/XPJU/GLs0gbID0mUrMmKhMi/raxXEdmGLk4vq9Cjqsx7QjbUCKjxB06Sp9ybaJsQalongZ+fxTsv+ZpWf6vnR99qZ252+ZX3B0/FQZhET7VK6eG3+rmq2v6oqBcnEfmn9F16j+wXpz+ztSVzU0y2kYM7mi8ttYckilY46SgItdLjlCqYBJg+I60YYR8l3gP7JCjH7Kosl1e9jCL0VCuwIV6YxLUrf0gNtEBOP6jl2MvJpxsNiDTLm1ZXefole4LiqPZTKtMeds+AYP6jH5C/zZuBg2QFuNR/j2svwZ82aO+ZRyo7LLiWnu9fE59E+7pwqaf/SiDUzhj7v7QGCZOaPftLaxSMTXGEIifHs8qhSi1R9YCjBO+6ccWjoKigs+4ChXQ8Vsqj5i5R1jXVgYXoiJBT894LnKNwzpj14Rn60FceBqrDIB+rHAHJIsbRA701CYdxkWEQT0DHwoulkE0NErsIW0uHYSEYiXRU9N7qDO0MnnW2zpNOlcY5wDX6zLn+nIM9OTe0Rx7kYyjo9AyfPu7Vgn5ZJfsmyQxp78Sgwfh3S4oTb+K9Py9pAaBvdtXJ4iq/h5ttWv02WGDXxVFxVfrU2YXZm1Fz7QHkUd9SF3Df9x6mvo4tV2Oob4zEfMVqMB9AnvpAhzz3LVnc146Fi3vPdX38SBtrRkQNETaDBxoBC/Cg8njQZOFBHzk4PgMfADYILrnG+EAQBJfkHd0JhgRSQhEpb24+vKMnL52EtrOq3GHkDmEndULScD/j8ZCEuOHa3IR8bxXSQ4c6jfYZ8Cp1qecg/zI/EezlowVvbCzCQG4EgABDacg2FNBUEyBEXsSm/ewR24jxmdkzPrOmjM/MVsr3ZbgtInirbiyiUgNwGGMxvZtF/9gG9EZkm9AQL87uGm6D78hJB24xX/6AltQLyutjXNNeKPwgQNDHLJfYcCQN+lRWQVRL0NGropkWXZaEzBOxQ0TF6ikq9pSiYo0fRwhV5jMK754VN7w+/utqM4aA8C2vAYZo3gfKkgB7loABEmD3lAAwaWBhcxrwlfe673ubuC26rvkOxZtszXPp7VIMXLE8HZ2IYzDPZId30OSHd1C4xCGLdXBm3QDWaT1ZZ05JOmBO6WNQ83Dl70M+ZjxcOXwKHwPjHKIr0DlNcziUskSh1GXbWOpPFK56DtM47L2aeRlz4nkZYxjvwSvXdtB3VJXNSh87u6UctOdw06wD2S2ws7yk7BZ+0JfNz1BbgNFrXF/XtIkX2IsLzylHKrDYcFyzBSlHotUWaEjzjtraj/deXVFEfblDQgclrRMjLx2EncGYy26k9QHeyaZglVsaAeHYOX9nsKxpaKwPBLGRcGkZyPOCiGg/uRcaCbGmIMMSiDIsAZCW8WHNMeqQbK++6V5Z8DdZvld7wtcJLSjOC4Rt05S6LVgcMARhkaz1QW0eyg6Sib7zlNCYVCba8/9mmThBmYAaO7SdXCaKCHKWiZfJhN5TJrRJp7qhKC10lomTlYlmNGEIZsCOKxPDtpi8epnouyCmK2JcHEkm2lfEZpk4QZloRhOTy4Q2RxPH2R0+dP/ni9bPnr07XIFd5eWsn2nGDMH5BQWTvqBAH7a7YxoIqqeDQa1vZqY+iQyqjSVQ07QPyGBneUkYPJtVgiNAaShCDkLAMhurPRl081qNoGqMTo7PCp4kUBcG5WXlkN9
uBOZwvmux0WSFQrC7/Lij/mF7jKbpyMzT6ciKPJgT7chgMatTdkwHginV7CovqSMT5V7JkLrxZiAGPVeXaM4SCUEzOXB6kTybASevdSewu1vvm49hqFJCOQ2wotY3lHu22AJ2MsTSDoitAbrKSxJbCa8ZEqrcvRceELXXvuHUMliZsy3BdtNipuEo20314YMOAERg+DjOZYqrxNsSOPWes2amnyhIMREu4vQj+0UH8diZ/vvTpeFr7MyZJdh3KdztLG1vjioJfoEX/jIBBGkrKB8FFVRxhV9nfLKRoAobkaAIoeZREdqeZVG68fMNlWfly0X6EM1OrfH6pkWrL/n9gCM0KLAbgXUxyVTfjCdI+i/W/MZvzh6v0RK+le7YDWexEaQhyBHXRB21tA0xZ7P+cJIjkhzPR9rEKNh12Nhh1Xx35EgDkuZuyOJF9G0DEq3xXM8tf+TdlvSwelV+Vrz6Dwfg7f8=
\ No newline at end of file
+7V1bk5s2FP41O9M+ZAdJXB/31t3OZDtJk6bJUwaDbJMCcgHvevvrK4ywEYiL18jgLJsHm4MEWOf7Ph1JR+QC3QSb+8heLR+Ji/0LqLibC3R7ASFQoU4/UstLZjGgmhkWkeeyQnvDJ+8/zIwKs649F8dcwYQQP/FWvNEhYYidhLPZUUSe+WJz4vN3XdkLXDF8cmy/av3bc5NlZjU1ZW9/wN5imd8ZKOxMYOeFmSFe2i55LpjQ3QW6iQhJsm/B5gb7aePl7ZLV+63m7O7BIhwmXSp8v3nU7S/Blx/wPQ7/evCX0ebrO1VjD5e85L8Yu7QB2CGJkiVZkND27/bW64isQxenl1Xo0b7Me0JW1Aio8QdOkhfmTXudEGpaJoHPzuKNl3xNq19q7Ohb4czthl15e/CSH4RJ9FKolB5+K57bV9se5fXiJCL/7HyX3qPacKwtY7KOHNzQWmmpLQLtaIGThoKIgTxty8ItmGPuMQkwfUZaIMK+nXhPPNhshtnFrhyrehVF9kuhwIp4YRIXrvwhNdACjH5IZdhj5NP0EkTK5Q2zqTz9kj1BflT4KXvTFnYHQJD96CfbX7NmqEByD7jUf89LL8GfVvbWVc9UdnhwzT3fvyE+ibZ10VxL/+2AUDijb//SGiRMCvbsL62RMzbFkR05DM+wgirYhKonHCV404iC/KzGuwvk0vG8Vx7IXKIsC6qDctERIafgvVc4R6k4Y9KHen3oKg99q8NRPoYVApJZjKMnemsSHsdFjkFVAjomns2FbCqR2LWxOXc4FoKeSEdF71LjaKdXWWdpVdJBaZwDlUafONfGpVbO9d4jH+VjJOj0dJ8+//WMflkk2ybJDGnvxKFB/3dN8hPv4q0/r2gBoK02+5P5VX4PV+u0+l0ww66Lo/yq9KmzC/M3o+bCA8ijvglnaNv3tlNfw6arctTXe2K+YpaYD1CV+kBDVe6bsrivngoXj57r+viZtt+EiAIiLA4PNAIW4AFW8aDKwoPWc3B8Bj4AfBC84xrnA0EQvCNv707QJZASiUh5e/vhnp68chLazlB5wLZ7DDtpiycl93MeD0mIS65lJtv3FiE9dKgfaZ+BrlP/eY7tX7ETwVY+avDGxyIc5HoACNCVkmwjAU1VAULkRWzqzx6xvT4+MzrGZ+ao4jOjlvJdGW6JCF6rG7NopwE4jLGY3uWif6wDeiOyTmiIF2d3DdfBd9tJB25xtXyLlhQLyutjXMOaKdVBgKCPmc+x7kga9EFeQaAp6OihaKZFkyUh00TsAaJidhQVa1SiYvYfRwhV5rMdPhwUN7w9/muwHEMgdFnVAF0074NkSYA1SUB3CbA6SgAYV2BhVTTga9Xrvu+t4rrouuA7O15la55zb5Ni4Jrnae9E7IN5Bj+8Q0Z1eIeESxyyWIcm1nVnndqRdcaoSAeMIX0MCh7e+7vNx5yH9w4/gY+BfpbRFWicpmkPpUxRKHVVN5b60w4XHYdpFey9mXkZY+B5Gf043oO3pe2g66gqn4QeOLtlN2hncFPNluwW1FheUnZLddCXzc9QW4Dtt7i+rqoDL7DnF55SjiAw+XBctQQpR6LVFqRL8w6s7cc7r64oor7cIaFjJ7UTI68dhJ3BmMsqpfWBqpMNwSq3NAKivnP+zmBZU1d5HwhiI+HSMpDnBRHRfnIvlBJiDUGGJRBlWAIgLePDnGLUA7K9uqZ75bHeSAanqD7ha0QLitMCYd00pWYJFgd0QVgka31QnYayh8hE13lKpI9LJurz/yaZGKFMIJUf2g4uE3kEOclEJ5nQOsqEOq6pbiRKC51kYrQyUY4mdMEM2Gll4rgtJm9NJrouiOUTZWORifoVsUkmRigT5WhicJlQp2jiAJnIp8/aBx297/981frZwbvDFdRUXs76mapPEJxeUDDoCwq043Z3DANBOFyKWNfMTG0cMghLS6CGYbXIYGN5SRg8m1UCGVDqGSGtEDCN0mpPhmVWqxRU9dHJVbOCBwnUhUH
5rnJY3W4EpnC+abHR4IVCsLv8tKP+4/YYDdORGYN1ZPlo/lw6MpTP6uw6ppZgChpN5SV1ZKLcKxlS198MxFHP1SSak0QiUE4OHF4kz2bAWdW60+/u1rrmY+jwNKGcCnhR6xrKHSy2gJ8MMdUWsdVBU3lJYivhNUNClXv0whZRe+sbTk2dlznLFGw3zWcaTrLdVDt+0AGACAwf+7lMfpV4vQNOsecsmOmnHaSYCGdx+pH9olY8Nqb//nRp+Co/c2YK9l0KdztL25sDJcEv8MJfBoAgbQXlo6ACFFf4dcInHwlCVIoERQg1TorQ+iyLnRs/31J5Vr5cpA9R7tRKr2+a1fqyuh+whwYFVimwzieZipvxBEn/+Zpf/83Z4TVawrfSnbrhTD6C1AU54qqoo5a2IeZs1h/GMCJh8B1qE6Ng12Fph1X53ZE9DUjKuyHzF9HXDUjU0nMdWv7Euy3p4f5V+Vnx/X84gO7+Bw==7Vxbt5o4FP415/F0kQQCPOq5tJ1e1plpV6edl1kRojIHiYNYtb++QYkQEi+nEPGcqg/KzgXI/vaXnZ0NV+hmsnydkun4AwtpfAWtcHmFbq8gBDbE/CeXrDYSF9obwSiNwqJSKfgU/aCF0Cqk8yikM6lixlicRVNZGLAkoUEmyUiasoVcbchi+axTMqKK4FNAYlX6dxRm443Uc6xS/oZGo7E4M7CKkgkRlQvBbExCtqiI0N0VukkZyzb/JssbGueDJ8Ylip0vDPcH797f/fEwsd8FD9eL601n909psr2FlCbZL3f9o2e/S1b35Os/N29Xt70VfvsQi66/k3hejFdxr9lKDGDK5klI807AFeovxlFGP01JkJcuOGS4bJxN4qJ4GMXxDYtZum6Lhk7+5fJZlrJHWinB60/egiVZRb755C3EaFv8gKRBgSy47UxoM5cUd0HTjC5raj8wZmCrSG4BlE1olq54O9GLgEkBfiCwsCihBHEhG1dghASKSAHf0bbvUkX8T6GlJ2hMVRANOeCLQ5ZmYzZiCYnvSmm/VGE+mGWd94xNC8X9R7NsVYwxmWdMVitdRtnXvPkrpzj6Vim5XRY9rw9W4iDht1tplB9+q5aVzdZHop2sXbDVbn6bv6BbPlRsngZ0z5CigpZIOqLZnnqOHispjUkWfZcvrnW9I8VS2WBG0+/81CxpZrSSqamWGnh0MNSaXc3aQ0K9YSCZKzBpnRC5rxzJPrFqnr6jWic0ZZz2xTjbNk7nSOPEXRqno5lGccwvtz/gf0bZepw2gny+kyCC/58zUXA9Wyu5xysAZ7osC0Uvb5PpPG9+NxnQMKSp6JVf9aZj+WRcXLkAcxzhwQFaz+aHOcKhXmhLHIFNUoTl1SgCIJUjgINUkvBMkQQ+FVg+RGEY0wUfwQtMDsHEl0ACoaMBCVRBYpsCiduyY/5cFQNkB3xrlZJiNA741sxb14xnwHyRznxvbx9e88JekPHBh9YbSsImdsw1k9UwIcEgYQmt6bsQkTgaJfww4JrkUw7q53qO+BK7VxRM1kSzA4SyfyPh0BRqALZqrI80Bm1rYGPMM/Rfumdo2g8URHDQEQSwS09QXKaGHo5lA19HBjs5ZpBu+YImM6qngnrVj/MJPxGbZ9ybnG3Omswn/5IgX0zO1PoHeKda0dwkFbr+wFIXIZpJajikODjlQhTKbAN9zSwFtWEiUzAEiiZeGN+cfiUK4LEMZHfKQLB9B0VLSZ9J8uZJDsmFLNbk4MC6c4LQK9U9wbrAFTIFGnThi9b5wj6WL/xO+cJW+OKrCoY4jqazXS5+RaVkNt3s4g2jZQ6NvmzTrRutMSt15YUnctUpHZ10RncuFtq2heJjDdTSY+VEBoq71Dyo6L1EwSHNS3ovYXA2mgfu8/DldGHArS932G/zdH5bb9cq7y+SjI5cQCqI/L2jS27H0SXcLLy0ZoP5ZCrqF5o5+/mCpFkvTxEqL3ktu4/y0VvXqWWX+L0bt39/Cn7xj+QX22vIL+u
mfBTIqlJhyqIkm1V6fsgFFe/GkwFse7XcoVp94FuN6mNrf/369dTq8z+bOywtZDtUDbZYmq15ujWaFwF+3K1f5SuT6zRlXDn8XMFvnVUj27rdcVIN1IXUm2jmhSUpQqvuj2AANAElBHQuiXBd2s+JcZ8xvZr0SUzT7zbN+mA+Y6c5U6LjBjtlwNKtbwKWBCTbGZn+1cjWMw1kIVAPN7sW1rCDq0mBMkfpajarycn2fNNbalFGz3I0qgEYqLrBxnj7spbshreRdyRv2ztAdaJE9N0JUO1mODwhn+HANmSj60rUXAlwyYfYT2ueX5tzPKjzSPU5EaaIzW7B4QA6hPzZTjf8l0xyBCSDWf6zuc6DG+x7vZvfYsWKgLxi9Xxft5+OT+jg2DoHp8ZvX66UpM6nZG8aM15QX0564pHPanjbVkfThoZG02229/lyXZKuInz2sa4KbOqqtBLexv7+8DOy/X31zYSfbROJ5JeHhtpgQNGPgAP0Na6LpxKgY2o6cZs9WHghwNYJ8Ngtjm4IcPvWhrMmQHX/5UKAZ0GACIP6jgJ0teu3U5Kg03y/9kSMp+HaswgwHclZdtN81FacNnAgZ8CGe+sbyhm4bGqd10TsHJvDiZom2jTjrstD32c3y0HhVYt9MREg7Oyhb+xd2OWs2EW8T8Z8Gl8z3KgZMo/XHz9yyeeUch6wuOIf59NmVt5R1kzeiQC0COAbf4DT0b0kRLdZYczZxeqE8TklURIlI0WNamy4zZxqWQG7bTEmAxr3SfA4WreXtp/yj0EmB9iurVecrp/3d9XNpjzv/qI9jfHhejKya0p3/LB8X+NmTVC+9RLd/QQ=
\ No newline at end of file
diff --git a/docs_raw/source/components/agents/index.rst b/docs_raw/source/components/agents/index.rst
index ca21713..c958768 100644
--- a/docs_raw/source/components/agents/index.rst
+++ b/docs_raw/source/components/agents/index.rst
@@ -21,8 +21,6 @@ A detailed description of those algorithms can be found by navigating to each of
imitation/cil
policy_optimization/cppo
policy_optimization/ddpg
- policy_optimization/td3
- policy_optimization/sac
other/dfp
value_optimization/double_dqn
value_optimization/dqn
@@ -36,6 +34,10 @@ A detailed description of those algorithms can be found by navigating to each of
policy_optimization/ppo
value_optimization/rainbow
value_optimization/qr_dqn
+ policy_optimization/sac
+ policy_optimization/td3
+ policy_optimization/wolpertinger
+
.. autoclass:: rl_coach.base_parameters.AgentParameters
diff --git a/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst b/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst
new file mode 100644
index 0000000..5aa57d2
--- /dev/null
+++ b/docs_raw/source/components/agents/policy_optimization/wolpertinger.rst
@@ -0,0 +1,56 @@
+Wolpertinger
+=============
+
+**Actions space:** Discrete
+
+**References:** `Deep Reinforcement Learning in Large Discrete Action Spaces <https://arxiv.org/pdf/1512.07679.pdf>`_
+
+Network Structure
+-----------------
+
+.. image:: /_static/img/design_imgs/wolpertinger.png
+ :align: center
+
+Algorithm Description
+---------------------
+Choosing an action
+++++++++++++++++++
+
+Pass the current states through the actor network, and get a proto action :math:`\mu`.
+While in training phase, use a continuous exploration policy, such as additive Gaussian noise,
+to add exploration noise to the proto action. Then, pass the proto action to a k-NN tree to find actual valid
+action candidates, which are in the surrounding neighborhood of the proto action. Those actions are then passed to the
+critic to evaluate their goodness, and eventually the discrete index of the action with the highest Q value is chosen.
+When testing, the same flow is used, but no exploration noise is added.
+
+Training the network
+++++++++++++++++++++
+
+Training the network is exactly the same as in DDPG. Unlike when choosing the action, the proto action is not passed
+through the k-NN tree. It is being passed directly to the critic.
+
+Start by sampling a batch of transitions from the experience replay.
+
+* To train the **critic network**, use the following targets:
+
+ :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},\mu(s_{t+1} ))`
+
+ First run the actor target network, using the next states as the inputs, and get :math:`\mu (s_{t+1} )`.
+ Next, run the critic target network using the next states and :math:`\mu (s_{t+1} )`, and use the output to
+ calculate :math:`y_t` according to the equation above. To train the network, use the current states and actions
+ as the inputs, and :math:`y_t` as the targets.
+
+* To train the **actor network**, use the following equation:
+
+ :math:`\nabla_{\theta^\mu } J \approx E_{s_t \sim \rho^\beta } [\nabla_a Q(s,a)|_{s=s_t,a=\mu (s_t ) } \cdot \nabla_{\theta^\mu} \mu(s)|_{s=s_t} ]`
+
+ Use the actor's online network to get the action mean values using the current states as the inputs.
+ Then, use the critic online network in order to get the gradients of the critic output with respect to the
+ action mean values :math:`\nabla _a Q(s,a)|_{s=s_t,a=\mu(s_t ) }`.
+ Using the chain rule, calculate the gradients of the actor's output, with respect to the actor weights,
+ given :math:`\nabla_a Q(s,a)`. Finally, apply those gradients to the actor network.
+
+After every training step, do a soft update of the critic and actor target networks' weights from the online networks.
+
+
+.. autoclass:: rl_coach.agents.wolpertinger_agent.WolpertingerAlgorithmParameters
\ No newline at end of file
diff --git a/docs_raw/source/diagrams.xml b/docs_raw/source/diagrams.xml
index 15f067f..9b5e64a 100644
--- a/docs_raw/source/diagrams.xml
+++ b/docs_raw/source/diagrams.xml
@@ -1 +1 @@

\ No newline at end of file

\ No newline at end of file
diff --git a/rl_coach/agents/agent.py b/rl_coach/agents/agent.py
index 3db0aaf..5d12e0b 100644
--- a/rl_coach/agents/agent.py
+++ b/rl_coach/agents/agent.py
@@ -1003,7 +1003,7 @@ class Agent(AgentInterface):
"""
Allows setting a directive for the agent to follow. This is useful in hierarchy structures, where the agent
has another master agent that is controlling it. In such cases, the master agent can define the goals for the
- slave agent, define it's observation, possible actions, etc. The directive type is defined by the agent
+ slave agent, define its observation, possible actions, etc. The directive type is defined by the agent
in-action-space.
:param action: The action that should be set as the directive
diff --git a/rl_coach/agents/wolpertinger_agent.py b/rl_coach/agents/wolpertinger_agent.py
new file mode 100644
index 0000000..a16b9e9
--- /dev/null
+++ b/rl_coach/agents/wolpertinger_agent.py
@@ -0,0 +1,131 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import copy
+from typing import Union
+from collections import OrderedDict
+import numpy as np
+
+from rl_coach.agents.ddpg_agent import DDPGAlgorithmParameters, DDPGActorNetworkParameters, \
+ DDPGCriticNetworkParameters, DDPGAgent
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import ActionInfo
+from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
+from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
+from rl_coach.memories.non_episodic.differentiable_neural_dictionary import AnnoyDictionary
+from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
+from rl_coach.architectures.head_parameters import WolpertingerActorHeadParameters
+
+
+class WolpertingerCriticNetworkParameters(DDPGCriticNetworkParameters):
+    """
+    Network parameters of the Wolpertinger critic. Nothing is changed relative to the DDPG critic parameters;
+    ``use_batchnorm`` is forwarded as is to :class:`DDPGCriticNetworkParameters`.
+    """
+    def __init__(self, use_batchnorm=False):
+        super().__init__(use_batchnorm=use_batchnorm)
+
+
+class WolpertingerActorNetworkParameters(DDPGActorNetworkParameters):
+ def __init__(self, use_batchnorm=False):
+ super().__init__()
+ self.heads_parameters = [WolpertingerActorHeadParameters(batchnorm=use_batchnorm)]
+
+
+class WolpertingerAlgorithmParameters(DDPGAlgorithmParameters):
+    """
+    :param action_embedding_width: (int)
+        The width of the proto action that the actor head outputs. It is also used as the key width of the k-NN
+        tree that is queried with the proto action.
+
+    :param k: (int)
+        The number of nearest neighbors that are fetched from the k-NN tree for a proto action. The critic scores
+        these candidates, and the one with the highest Q value is chosen.
+    """
+    def __init__(self):
+        super().__init__()
+        self.action_embedding_width = 1
+        self.k = 1
+
+
+class WolpertingerAgentParameters(AgentParameters):
+    """
+    Agent parameters of the Wolpertinger agent: the Wolpertinger algorithm parameters, an additive noise
+    exploration policy, an episodic experience replay memory, and an actor network + a critic network.
+
+    :param use_batchnorm: passed to both the actor and the critic network parameters
+    """
+    def __init__(self, use_batchnorm=False):
+        exploration_params = AdditiveNoiseParameters()
+        exploration_params.noise_as_percentage_from_action_space = False
+
+        super().__init__(algorithm=WolpertingerAlgorithmParameters(),
+                         exploration=exploration_params,
+                         memory=EpisodicExperienceReplayParameters(),
+                         networks=OrderedDict(
+                             [("actor", WolpertingerActorNetworkParameters(use_batchnorm=use_batchnorm)),
+                              ("critic", WolpertingerCriticNetworkParameters(use_batchnorm=use_batchnorm))]))
+
+    @property
+    def path(self):
+        # '<module path>:<class name>' of the agent class that goes with these parameters
+        return 'rl_coach.agents.wolpertinger_agent:WolpertingerAgent'
+
+
+# Deep Reinforcement Learning in Large Discrete Action Spaces - https://arxiv.org/pdf/1512.07679.pdf
+class WolpertingerAgent(DDPGAgent):
+ def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent'] = None):
+ super().__init__(agent_parameters, parent)
+
+ def learn_from_batch(self, batch):
+ # replay buffer holds the actions in the discrete manner, as the agent is expected to act with discrete actions
+ # with the BoxDiscretization output filter. But DDPG needs to work on continuous actions, thus converting to
+ # continuous actions. This is actually a duplicate since this filtering is also done before applying actions on
+ # the environment. So might want to somehow reuse that conversion. Maybe can hold this information in the info
+ # dictionary of the transition.
+
+ output_action_filter = \
+ list(self.output_filter.action_filters.values())[0]
+ continuous_actions = []
+ for action in batch.actions():
+ continuous_actions.append(output_action_filter.filter(action))
+ batch._actions = np.array(continuous_actions).squeeze()
+
+ return super().learn_from_batch(batch)
+
+ def train(self):
+ return super().train()
+
+ def choose_action(self, curr_state):
+ if not isinstance(self.spaces.action, DiscreteActionSpace):
+ raise ValueError("WolpertingerAgent works only for discrete control problems")
+
+ # convert to batch so we can run it through the network
+ tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
+ actor_network = self.networks['actor'].online_network
+ critic_network = self.networks['critic'].online_network
+ proto_action = actor_network.predict(tf_input_state)
+ proto_action = np.expand_dims(self.exploration_policy.get_action(proto_action), 0)
+
+ nn_action_embeddings, indices, _, _ = self.knn_tree.query(keys=proto_action, k=self.ap.algorithm.k)
+
+ # now move the actions through the critic and choose the one with the highest q value
+ critic_inputs = copy.copy(tf_input_state)
+ critic_inputs['observation'] = np.tile(critic_inputs['observation'], (self.ap.algorithm.k, 1))
+ critic_inputs['action'] = nn_action_embeddings[0]
+ q_values = critic_network.predict(critic_inputs)[0]
+ action = int(indices[0][np.argmax(q_values)])
+ self.action_signal.add_sample(action)
+ return ActionInfo(action=action, action_value=0)
+
+ def init_environment_dependent_modules(self):
+ super().init_environment_dependent_modules()
+ self.knn_tree = self.get_initialized_knn()
+
+ # TODO - ideally the knn should not be defined here, but somehow be defined by the user in the preset
+ def get_initialized_knn(self):
+ num_actions = len(self.spaces.action.actions)
+ action_max_abs_range = self.spaces.action.filtered_action_space.max_abs_range if \
+ (hasattr(self.spaces.action, 'filtered_action_space') and
+ isinstance(self.spaces.action.filtered_action_space, BoxActionSpace)) \
+ else 1.0
+ keys = np.expand_dims((np.arange(num_actions) / (num_actions - 1) - 0.5) * 2, 1) * action_max_abs_range
+ values = np.expand_dims(np.arange(num_actions), 1)
+ knn_tree = AnnoyDictionary(dict_size=num_actions, key_width=self.ap.algorithm.action_embedding_width)
+ knn_tree.add(keys, values, force_rebuild_tree=True)
+
+ return knn_tree
+
diff --git a/rl_coach/architectures/head_parameters.py b/rl_coach/architectures/head_parameters.py
index ee607dd..207ea3e 100644
--- a/rl_coach/architectures/head_parameters.py
+++ b/rl_coach/architectures/head_parameters.py
@@ -108,6 +108,17 @@ class DDPGActorHeadParameters(HeadParameters):
self.batchnorm = batchnorm
+class WolpertingerActorHeadParameters(HeadParameters):
+    """
+    Parameters of the actor head of the Wolpertinger agent. The parameterized class name that is handed to
+    HeadParameters is "WolpertingerActorHead"; ``batchnorm`` is kept on the parameters object itself.
+    """
+    def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True,
+                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
+                 loss_weight: float = 1.0, dense_layer=None):
+        super().__init__(parameterized_class_name="WolpertingerActorHead", activation_function=activation_function, name=name,
+                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
+                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
+                         loss_weight=loss_weight)
+        self.batchnorm = batchnorm
+
+
class DNDQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
diff --git a/rl_coach/architectures/tensorflow_components/heads/__init__.py b/rl_coach/architectures/tensorflow_components/heads/__init__.py
index 03c237a..0a83399 100644
--- a/rl_coach/architectures/tensorflow_components/heads/__init__.py
+++ b/rl_coach/architectures/tensorflow_components/heads/__init__.py
@@ -18,6 +18,7 @@ from .classification_head import ClassificationHead
from .cil_head import RegressionHead
from .td3_v_head import TD3VHead
from .ddpg_v_head import DDPGVHead
+from .wolpertinger_actor_head import WolpertingerActorHead
__all__ = [
'CategoricalQHead',
@@ -38,6 +39,7 @@ __all__ = [
'SACQHead',
'ClassificationHead',
'RegressionHead',
- 'TD3VHead'
- 'DDPGVHead'
+ 'TD3VHead',
+ 'DDPGVHead',
+ 'WolpertingerActorHead'
]
diff --git a/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py b/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py
new file mode 100644
index 0000000..3521a95
--- /dev/null
+++ b/rl_coach/architectures/tensorflow_components/heads/wolpertinger_actor_head.py
@@ -0,0 +1,59 @@
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import tensorflow as tf
+
+from rl_coach.architectures.tensorflow_components.layers import batchnorm_activation_dropout, Dense
+from rl_coach.architectures.tensorflow_components.heads.head import Head
+from rl_coach.base_parameters import AgentParameters
+from rl_coach.core_types import Embedding
+from rl_coach.spaces import SpacesDefinition, BoxActionSpace
+
+
+class WolpertingerActorHead(Head):
+ def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
+ head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
+ batchnorm: bool=True, dense_layer=Dense, is_training=False):
+ super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
+ dense_layer=dense_layer, is_training=is_training)
+ self.name = 'wolpertinger_actor_head'
+ self.return_type = Embedding
+ self.action_embedding_width = agent_parameters.algorithm.action_embedding_width
+ self.batchnorm = batchnorm
+ self.output_scale = self.spaces.action.filtered_action_space.max_abs_range if \
+ (hasattr(self.spaces.action, 'filtered_action_space') and
+ isinstance(self.spaces.action.filtered_action_space, BoxActionSpace)) \
+ else None
+
+ def _build_module(self, input_layer):
+ # mean
+ pre_activation_policy_value = self.dense_layer(self.action_embedding_width)(input_layer,
+ name='actor_action_embedding')
+ self.proto_action = batchnorm_activation_dropout(input_layer=pre_activation_policy_value,
+ batchnorm=self.batchnorm,
+ activation_function=self.activation_function,
+ dropout_rate=0,
+ is_training=self.is_training,
+ name="BatchnormActivationDropout_0")[-1]
+ if self.output_scale is not None:
+ self.proto_action = tf.multiply(self.proto_action, self.output_scale, name='proto_action')
+
+ self.output = [self.proto_action]
+
+ def __str__(self):
+ result = [
+ 'Dense (num outputs = {})'.format(self.action_embedding_width)
+ ]
+ return '\n'.join(result)
diff --git a/rl_coach/exploration_policies/additive_noise.py b/rl_coach/exploration_policies/additive_noise.py
index 8194718..8b67c7d 100644
--- a/rl_coach/exploration_policies/additive_noise.py
+++ b/rl_coach/exploration_policies/additive_noise.py
@@ -62,7 +62,9 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
self.evaluation_noise = evaluation_noise
self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space
- if not isinstance(action_space, BoxActionSpace):
+ if not isinstance(action_space, BoxActionSpace) and \
+ (hasattr(action_space, 'filtered_action_space') and not
+ isinstance(action_space.filtered_action_space, BoxActionSpace)):
raise ValueError("Additive noise exploration works only for continuous controls."
"The given action space is of type: {}".format(action_space.__class__.__name__))
diff --git a/rl_coach/exploration_policies/exploration_policy.py b/rl_coach/exploration_policies/exploration_policy.py
index a345895..688fcce 100644
--- a/rl_coach/exploration_policies/exploration_policy.py
+++ b/rl_coach/exploration_policies/exploration_policy.py
@@ -115,5 +115,8 @@ class ContinuousActionExplorationPolicy(ExplorationPolicy):
"""
:param action_space: the action space used by the environment
"""
- assert isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)
+ assert isinstance(action_space, BoxActionSpace) or \
+ (hasattr(action_space, 'filtered_action_space') and
+ isinstance(action_space.filtered_action_space, BoxActionSpace)) or \
+ isinstance(action_space, GoalsSpace)
super().__init__(action_space)
diff --git a/rl_coach/filters/action/partial_discrete_action_space_map.py b/rl_coach/filters/action/partial_discrete_action_space_map.py
index 2322698..ad6e105 100644
--- a/rl_coach/filters/action/partial_discrete_action_space_map.py
+++ b/rl_coach/filters/action/partial_discrete_action_space_map.py
@@ -48,7 +48,8 @@ class PartialDiscreteActionSpaceMap(ActionFilter):
def get_unfiltered_action_space(self, output_action_space: ActionSpace) -> DiscreteActionSpace:
self.output_action_space = output_action_space
- self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions)
+ self.input_action_space = DiscreteActionSpace(len(self.target_actions), self.descriptions,
+ filtered_action_space=output_action_space)
return self.input_action_space
def filter(self, action: ActionType) -> ActionType:
diff --git a/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py b/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
index 3368ee8..8633118 100644
--- a/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
+++ b/rl_coach/memories/non_episodic/differentiable_neural_dictionary.py
@@ -57,7 +57,7 @@ class AnnoyDictionary(object):
self.built_capacity = 0
- def add(self, keys, values, additional_data=None):
+ def add(self, keys, values, additional_data=None, force_rebuild_tree=False):
if not additional_data:
additional_data = [None] * len(keys)
@@ -96,7 +96,7 @@ class AnnoyDictionary(object):
if len(self.buffered_indices) >= self.min_update_size:
self.min_update_size = max(self.initial_update_size, int(self.curr_size * 0.02))
self._rebuild_index()
- elif self.rebuild_on_every_update:
+ elif force_rebuild_tree or self.rebuild_on_every_update:
self._rebuild_index()
self.current_timestamp += 1
diff --git a/rl_coach/presets/Mujoco_Wolpertinger.py b/rl_coach/presets/Mujoco_Wolpertinger.py
new file mode 100644
index 0000000..f12e41c
--- /dev/null
+++ b/rl_coach/presets/Mujoco_Wolpertinger.py
@@ -0,0 +1,57 @@
+from collections import OrderedDict
+
+from rl_coach.architectures.layers import Dense
+from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
+from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
+from rl_coach.environments.environment import SingleLevelSelection
+from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
+from rl_coach.filters.action import BoxDiscretization
+from rl_coach.filters.filter import OutputFilter
+from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
+from rl_coach.graph_managers.graph_manager import ScheduleParameters
+from rl_coach.agents.wolpertinger_agent import WolpertingerAgentParameters
+
+####################
+# Graph Scheduling #
+####################
+schedule_params = ScheduleParameters()
+schedule_params.improve_steps = EnvironmentSteps(2000000)
+schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
+schedule_params.evaluation_steps = EnvironmentEpisodes(1)
+schedule_params.heatup_steps = EnvironmentSteps(3000)
+
+#########
+# Agent #
+#########
+agent_params = WolpertingerAgentParameters()
+# actor and critic share the same layout: a Dense(400) observation embedder and a Dense(300) middleware
+agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
+agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
+agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(400)]
+agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
+# the action input of the critic uses the Empty embedder scheme
+agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
+# the agent acts with discrete actions - the action space is discretized to 1e6 bins per action dimension
+agent_params.output_filter = \
+    OutputFilter(
+        action_filters=OrderedDict([
+            ('discretization', BoxDiscretization(num_bins_per_dimension=int(1e6)))
+        ]),
+        is_a_reference_filter=False
+    )
+
+###############
+# Environment #
+###############
+env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
+
+########
+# Test #
+########
+# preset validation settings, all of them target the inverted_pendulum level
+preset_validation_params = PresetValidationParameters()
+preset_validation_params.test = True
+preset_validation_params.min_reward_threshold = 500
+preset_validation_params.max_episodes_to_achieve_reward = 1000
+preset_validation_params.reward_test_level = 'inverted_pendulum'
+preset_validation_params.trace_test_levels = ['inverted_pendulum']
+
+graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
+                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
+                                    preset_validation_params=preset_validation_params)
diff --git a/rl_coach/spaces.py b/rl_coach/spaces.py
index 503598c..5dcaa2b 100644
--- a/rl_coach/spaces.py
+++ b/rl_coach/spaces.py
@@ -385,7 +385,8 @@ class DiscreteActionSpace(ActionSpace):
"""
A discrete action space with action indices as actions
"""
- def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None):
+ def __init__(self, num_actions: int, descriptions: Union[None, List, Dict]=None, default_action: np.ndarray=None,
+ filtered_action_space=None):
super().__init__(1, low=0, high=num_actions-1, descriptions=descriptions)
# the number of actions is mapped to high
@@ -395,6 +396,9 @@ class DiscreteActionSpace(ActionSpace):
else:
self.default_action = default_action
+ if filtered_action_space is not None:
+ self.filtered_action_space = filtered_action_space
+
@property
def actions(self) -> List[ActionType]:
return list(range(0, int(self.high[0]) + 1))