class Agent(AgentInterface):
@@ -222,6 +225,15 @@
        :param agent_parameters: An AgentParameters class instance with all the agent parameters
        """
        super().__init__()
+        # use seed
+        if agent_parameters.task_parameters.seed is not None:
+            random.seed(agent_parameters.task_parameters.seed)
+            np.random.seed(agent_parameters.task_parameters.seed)
+        else:
+            # we need to seed the RNG since the different processes are initialized with the same parent seed
+            random.seed()
+            np.random.seed()
+
        self.ap = agent_parameters
        self.task_id = self.ap.task_parameters.task_index
        self.is_chief = self.task_id == 0
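A quick aside on the else-branch in the seeding block just added: worker processes created with fork inherit the parent's RNG state, so without an explicit re-seed every rollout worker can draw the same "random" numbers. A minimal standalone sketch of the failure mode and the fix (not Coach code, just an illustration):

import random
from multiprocessing import Pool

def rollout(worker_id):
    # without this call, forked workers share the parent's RNG state and may all
    # produce identical "random" sequences; re-seeding from OS entropy (as in the
    # diff above) gives each process its own stream
    random.seed()
    return worker_id, random.random()

if __name__ == '__main__':
    random.seed(1234)          # parent seeds its own RNG before forking
    with Pool(4) as pool:
        print(pool.map(rollout, range(4)))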
@@ -229,10 +241,10 @@
            and self.ap.memory.shared_memory
        if self.shared_memory:
            self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
-        self.name = agent_parameters.name
        self.parent = parent
        self.parent_level_manager = None
-        self.full_name_id = agent_parameters.full_name_id = self.name
+        # TODO this needs to be sorted out. Why the duplicates for the agent's name?
+        self.full_name_id = agent_parameters.full_name_id = self.name = agent_parameters.name

        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
@@ -264,9 +276,17 @@
            self.memory.set_memory_backend(self.memory_backend)

        if agent_parameters.memory.load_memory_from_file_path:
-            screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
-                             .format(agent_parameters.memory.load_memory_from_file_path))
-            self.memory.load(agent_parameters.memory.load_memory_from_file_path)
+            if isinstance(agent_parameters.memory.load_memory_from_file_path, PickledReplayBuffer):
+                screen.log_title("Loading a pickled replay buffer. Pickled file path: {}"
+                                 .format(agent_parameters.memory.load_memory_from_file_path.filepath))
+                self.memory.load_pickled(agent_parameters.memory.load_memory_from_file_path.filepath)
+            elif isinstance(agent_parameters.memory.load_memory_from_file_path, CsvDataset):
+                screen.log_title("Loading a replay buffer from a CSV file. CSV file path: {}"
+                                 .format(agent_parameters.memory.load_memory_from_file_path.filepath))
+                self.memory.load_csv(agent_parameters.memory.load_memory_from_file_path)
+            else:
+                raise ValueError('Trying to load a replay buffer using an unsupported method - {}. '
+                                 .format(agent_parameters.memory.load_memory_from_file_path))

        if self.shared_memory and self.is_chief:
            self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)
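For context, the new branch above dispatches on the type of `load_memory_from_file_path`. A rough preset-side sketch of how that field might be filled in; only the `PickledReplayBuffer`/`CsvDataset` class names and their `filepath` attribute come from the diff, the import locations and constructor arguments are assumptions:

# hypothetical preset snippet - adjust imports/arguments to the actual Coach API;
# PickledReplayBuffer / CsvDataset imports are omitted since their module is not shown in the diff
from rl_coach.agents.dqn_agent import DQNAgentParameters

agent_params = DQNAgentParameters()

# option 1: restore a replay buffer that was previously pickled to disk
agent_params.memory.load_memory_from_file_path = PickledReplayBuffer('replay_buffer.p')

# option 2: load transitions from a CSV dataset (e.g. for batch RL training)
agent_params.memory.load_memory_from_file_path = CsvDataset('my_dataset.csv')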
@@ -327,6 +347,7 @@
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
+        self.training_epoch = 0
        self.last_target_network_update_step = 0
        self.last_training_phase_step = 0
        self.current_episode = self.ap.current_episode = 0
@@ -364,14 +385,9 @@
        self.discounted_return = self.register_signal('Discounted Return')
        if isinstance(self.in_action_space, GoalsSpace):
            self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)
-        # use seed
-        if self.ap.task_parameters.seed is not None:
-            random.seed(self.ap.task_parameters.seed)
-            np.random.seed(self.ap.task_parameters.seed)
-        else:
-            # we need to seed the RNG since the different processes are initialized with the same parent seed
-            random.seed()
-            np.random.seed()
+
+        # batch rl
+        self.ope_manager = OpeManager() if self.ap.is_batch_rl_training else None

    @property
    def parent(self) -> 'LevelManager':
@@ -408,6 +424,7 @@
                format(graph_name=self.parent_level_manager.parent_graph_manager.name,
                       level_name=self.parent_level_manager.name,
                       agent_full_id='.'.join(self.full_name_id.split('/')))
+        self.agent_logger.set_index_name(self.parent_level_manager.parent_graph_manager.time_metric.value.name)
        self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
                                               add_timestamp=True, task_id=self.task_id)
        if self.ap.visualization.dump_in_episode_signals:
@@ -561,13 +578,17 @@
            self.accumulated_shaped_rewards_across_evaluation_episodes = 0
            self.num_successes_across_evaluation_episodes = 0
            self.num_evaluation_episodes_completed = 0
-            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+
+            # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+            # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+            if self.ap.is_a_highest_level_agent:
                screen.log_title("{}: Starting evaluation phase".format(self.name))

        elif ending_evaluation:
            # we write to the next episode, because it could be that the current episode was already written
            # to disk and then we won't write it again
-            self.agent_logger.set_current_time(self.current_episode + 1)
+            self.agent_logger.set_current_time(self.get_current_time() + 1)
+
            evaluation_reward = self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed
            self.agent_logger.create_signal_value('Evaluation Reward', evaluation_reward)
@@ -577,9 +598,11 @@
            success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
            self.agent_logger.create_signal_value("Success Rate",
-                                                  success_rate
-                                                  )
-            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+                                                  success_rate)
+
+            # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+            # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+            if self.ap.is_a_highest_level_agent:
                screen.log_title("{}: Finished evaluation phase. Success rate = {}, Avg Total Reward = {}"
                                 .format(self.name, np.round(success_rate, 2), np.round(evaluation_reward, 2)))
@@ -652,8 +675,11 @@
        :return: None
        """
        # log all the signals to file
-        self.agent_logger.set_current_time(self.current_episode)
+        current_time = self.get_current_time()
+        self.agent_logger.set_current_time(current_time)
        self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
+        self.agent_logger.create_signal_value('Episode #', self.current_episode)
+        self.agent_logger.create_signal_value('Epoch', self.training_epoch)
        self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
        self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
        self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
@@ -666,12 +692,17 @@
                                              if self._phase == RunPhase.TRAIN else np.nan)
        self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
-        self.agent_logger.update_wall_clock_time(self.current_episode)
+        self.agent_logger.update_wall_clock_time(current_time)

-        if self._phase != RunPhase.TEST:
-            self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
-            self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
-            self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
+        # The following signals are created with meaningful values only when an evaluation phase is completed.
+        # Creating with default NaNs for any HEATUP/TRAIN/TEST episode which is not the last in an evaluation phase
+        self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Inverse Propensity Score', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Direct Method Reward', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Doubly Robust', np.nan, overwrite=False)
+        self.agent_logger.create_signal_value('Sequential Doubly Robust', np.nan, overwrite=False)
        for signal in self.episode_signals:
            self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
@@ -680,8 +711,7 @@
            self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # dump
-        if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
-                and self.current_episode > 0:
+        if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0:
            self.agent_logger.dump_output_csv()

    def handle_episode_ended(self) -> None:
@@ -717,10 +747,13 @@
                self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
            self.num_successes_across_evaluation_episodes += 1

-        if self.ap.visualization.dump_csv:
+        if self.ap.visualization.dump_csv and \
+                self.parent_level_manager.parent_graph_manager.time_metric == TimeTypes.EpisodeNumber:
            self.update_log()

-        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+        # TODO verbosity was mistakenly removed from task_parameters on release 0.11.0, need to bring it back
+        # if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
+        if self.ap.is_a_highest_level_agent:
            self.log_to_screen()

    def reset_internal_state(self) -> None:
@@ -831,18 +864,25 @@
        """
        loss = 0
        if self._should_train():
+            self.training_epoch += 1
            for network in self.networks.values():
                network.set_is_training(True)

-            for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
-                # TODO: this should be network dependent
-                network_parameters = list(self.ap.network_wrappers.values())[0]
+            # At the moment we only support a single batch size for all the networks
+            networks_parameters = list(self.ap.network_wrappers.values())
+            assert all(net.batch_size == networks_parameters[0].batch_size for net in networks_parameters)
+            batch_size = networks_parameters[0].batch_size
+
+            # we either go sequentially through the entire replay buffer in the batch RL mode,
+            # or sample randomly for the basic RL case.
+            training_schedule = self.call_memory('get_shuffled_data_generator', batch_size) if \
+                self.ap.is_batch_rl_training else [self.call_memory('sample', batch_size) for _ in
+                                                   range(self.ap.algorithm.num_consecutive_training_steps)]
+
+            for batch in training_schedule:
                # update counters
                self.training_iteration += 1
-
-                # sample a batch and train on it
-                batch = self.call_memory('sample', network_parameters.batch_size)
                if self.pre_network_filter is not None:
                    batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)
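In other words, with `is_batch_rl_training` a single `_should_train()` trigger now consumes one full shuffled pass over the replay buffer, which is why `training_epoch` is incremented alongside it. A toy sketch of the two scheduling modes, with a stand-in memory object (only the `get_shuffled_data_generator`/`sample` names come from the diff):

def build_training_schedule(memory, batch_size, is_batch_rl, num_consecutive_training_steps):
    if is_batch_rl:
        # one full shuffled pass over the stored transitions == one training epoch
        return memory.get_shuffled_data_generator(batch_size)
    # online RL: a fixed number of independently sampled mini-batches
    return [memory.sample(batch_size) for _ in range(num_consecutive_training_steps)]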
@@ -853,15 +893,19 @@
                batch = Batch(batch)
                total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
                loss += total_loss
+
                self.unclipped_grads.add_sample(unclipped_grads)

-                # TODO: the learning rate decay should be done through the network instead of here
+                # TODO: this only deals with the main network (if exists), need to do the same for other networks
+                # for instance, for DDPG, the LR signal is currently not shown. Probably should be done through the
+                # network directly instead of here
                # decay learning rate
-                if network_parameters.learning_rate_decay_rate != 0:
+                if 'main' in self.ap.network_wrappers and \
+                        self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
                    self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
                        self.networks['main'].online_network.current_learning_rate))
                else:
-                    self.curr_learning_rate.add_sample(network_parameters.learning_rate)
+                    self.curr_learning_rate.add_sample(networks_parameters[0].learning_rate)

                if any([network.has_target for network in self.networks.values()]) \
                        and self._should_update_online_weights_to_target():
@@ -877,6 +921,12 @@
            if self.imitation:
                self.log_to_screen()

+            if self.ap.visualization.dump_csv and \
+                    self.parent_level_manager.parent_graph_manager.time_metric == TimeTypes.Epoch:
+                # in BatchRL, or imitation learning, the agent never acts, so we have to get the stats out here.
+                # we dump the data out every epoch
+                self.update_log()
+
            for network in self.networks.values():
                network.set_is_training(False)
@@ -919,10 +969,11 @@
        return batches_dict

    def act(self, action: Union[None, ActionType]=None) -> ActionInfo:
        """
        Given the agent's current knowledge, decide on the next action to apply to the environment
+        :param action: An action to take, overriding whatever the current policy is
        :return: An ActionInfo object, which contains the action and any additional info from the action decision process
        """
        if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
@@ -935,20 +986,28 @@
            self.current_episode_steps_counter += 1

        # decide on the action
-        if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
-            # random action
-            self.last_action_info = self.spaces.action.sample_with_info()
-        else:
-            # informed action
-            if self.pre_network_filter is not None:
-                # before choosing an action, first use the pre_network_filter to filter out the current state
-                update_filter_internal_state = self.phase is not RunPhase.TEST
-                curr_state = self.run_pre_network_filter_for_inference(self.curr_state, update_filter_internal_state)
-
+        if action is None:
+            if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
+                # random action
+                action = self.spaces.action.sample_with_info()
            else:
-                curr_state = self.curr_state
-            self.last_action_info = self.choose_action(curr_state)
+                # informed action
+                if self.pre_network_filter is not None:
+                    # before choosing an action, first use the pre_network_filter to filter out the current state
+                    update_filter_internal_state = self.phase is not RunPhase.TEST
+                    curr_state = self.run_pre_network_filter_for_inference(self.curr_state, update_filter_internal_state)
+                else:
+                    curr_state = self.curr_state
+                action = self.choose_action(curr_state)
+        assert isinstance(action, ActionInfo)
+
+        self.last_action_info = action
+
+        # output filters are explicitly applied after recording self.last_action_info. This is
+        # because the output filters may change the representation of the action so that the agent
+        # can no longer use the transition in its replay buffer. It is possible that these filters
+        # could be moved to the environment instead, but they are here now for historical reasons.
        filtered_action_info = self.output_filter.filter(self.last_action_info)

        return filtered_action_info
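The new `action` argument lets an external caller (e.g. a scripted controller or a trainer worker) force a specific action while still going through the agent's bookkeeping and output filters. A hedged sketch of how that might be driven from outside, assuming a Gym-style environment object; the surrounding loop is illustrative and not Coach's actual control flow:

# illustrative only: forcing an action from outside the agent's own policy
forced = ActionInfo(action=env.action_space.sample())   # act() asserts it receives an ActionInfo
action_info = agent.act(action=forced)                  # choose_action() is skipped
env_response = env.step(action_info.action)
agent.observe(env_response)

# the usual path is unchanged:
action_info = agent.act()                               # the policy decides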
@@ -1030,37 +1089,35 @@
        # make agent specific changes to the transition if needed
        transition = self.update_transition_before_adding_to_replay_buffer(transition)

-        # merge the intrinsic reward in
-        if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
-            transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
-        else:
-            transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward
-
-        # sum up the total shaped reward
-        self.total_shaped_reward_in_current_episode += transition.reward
-        self.total_reward_in_current_episode += env_response.reward
-        self.shaped_reward.add_sample(transition.reward)
-        self.reward.add_sample(env_response.reward)
-
        # add action info to transition
        if type(self.parent).__name__ == 'CompositeAgent':
            transition.add_info(self.parent.last_action_info.__dict__)
        else:
            transition.add_info(self.last_action_info.__dict__)

-        # create and store the transition
-        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
-            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
-            # for regular memories we insert the transitions directly to the memory
-            self.current_episode_buffer.insert(transition)
-            if not isinstance(self.memory, EpisodicExperienceReplay) \
-                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
-                self.call_memory('store', transition)
+        self.total_reward_in_current_episode += env_response.reward
+        self.reward.add_sample(env_response.reward)

-        if self.ap.visualization.dump_in_episode_signals:
-            self.update_step_in_episode_log()
+        return self.observe_transition(transition)

-        return transition.game_over
+    def observe_transition(self, transition):
+        # sum up the total shaped reward
+        self.total_shaped_reward_in_current_episode += transition.reward
+        self.shaped_reward.add_sample(transition.reward)
+
+        # create and store the transition
+        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
+            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
+            # for regular memories we insert the transitions directly to the memory
+            self.current_episode_buffer.insert(transition)
+            if not isinstance(self.memory, EpisodicExperienceReplay) \
+                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
+                self.call_memory('store', transition)
+
+        if self.ap.visualization.dump_in_episode_signals:
+            self.update_step_in_episode_log()
+
+        return transition.game_over

-    # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
-    # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-
    def emulate_observe_on_trainer(self, transition: Transition) -> bool:
-        """
-        This emulates the observe using the transition obtained from the rollout worker on the training worker
-        in case of distributed training.
-        Given a response from the environment, distill the observation from it and store it for later use.
-        The response should be a dictionary containing the performed action, the new observation and measurements,
-        the reward, a game over flag and any additional information necessary.
-        :return:
-        """
-
-        # sum up the total shaped reward
-        self.total_shaped_reward_in_current_episode += transition.reward
-        self.total_reward_in_current_episode += transition.reward
-        self.shaped_reward.add_sample(transition.reward)
-        self.reward.add_sample(transition.reward)
-
-        # create and store the transition
-        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
-            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
-            # for regular memories we insert the transitions directly to the memory
-            self.current_episode_buffer.insert(transition)
-            if not isinstance(self.memory, EpisodicExperienceReplay) \
-                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
-                self.call_memory('store', transition)
-
-        if self.ap.visualization.dump_in_episode_signals:
-            self.update_step_in_episode_log()
-
-        return transition.game_over
-
-    # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
-    # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-
    def emulate_act_on_trainer(self, transition: Transition) -> ActionInfo:
-        """
-        This emulates the act using the transition obtained from the rollout worker on the training worker
-        in case of distributed training.
-        Given the agents current knowledge, decide on the next action to apply to the environment
-        :return: an action and a dictionary containing any additional info from the action decision process
-        """
-        if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
-            # This agent never plays while training (e.g. behavioral cloning)
-            return None
-
-        # count steps (only when training or if we are in the evaluation worker)
-        if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
-            self.total_steps_counter += 1
-            self.current_episode_steps_counter += 1
-
-        self.last_action_info = transition.action
-
-        return self.last_action_info
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Union, List, Dict
+
+import numpy as np
+
+from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType, Transition
+from rl_coach.saver import SaverCollection
+
+
+class AgentInterface(object):
+    def __init__(self):
+        self._phase = RunPhase.HEATUP
+        self._parent = None
+        self.spaces = None
+
+    @property
+    def parent(self):
+        """
+        Get the parent class of the agent
+        :return: the current parent
+        """
+        return self._parent
+
+    @parent.setter
+    def parent(self, val):
+        """
+        Change the parent class of the agent
+        :param val: the new parent
+        :return: None
+        """
+        self._parent = val
+
+    @property
+    def phase(self) -> RunPhase:
+        """
+        Get the phase of the agent
+        :return: the current phase
+        """
+        return self._phase
+
+    @phase.setter
+    def phase(self, val: RunPhase):
+        """
+        Change the phase of the agent
+        :param val: the new phase
+        :return: None
+        """
+        self._phase = val
+
+    def reset_internal_state(self) -> None:
+        """
+        Reset the episode parameters for the agent
+        :return: None
+        """
+        raise NotImplementedError("")
+
+    def train(self) -> Union[float, List]:
+        """
+        Train the agent's network
+        :return: The loss of the training
+        """
+        raise NotImplementedError("")
+
+    def act(self) -> ActionInfo:
+        """
+        Get a decision of the next action to take.
+        The action is dependent on the current state which the agent holds from resetting the environment or from
+        the observe function.
+        :return: A tuple containing the actual action and additional info on the action
+        """
+        raise NotImplementedError("")
+
+    def observe(self, env_response: EnvResponse) -> bool:
+        """
+        Gets a response from the environment.
+        Processes this information for later use. For example, create a transition and store it in memory.
+        The action info (a class containing any info the agent wants to store regarding its action decision process)
+        is stored by the agent itself when deciding on the action.
+        :param env_response: an EnvResponse containing the response from the environment
+        :return: a done signal which is based on the agent knowledge. This can be different from the done signal from
+                 the environment. For example, an agent can decide to finish the episode each time it gets some
+                 intrinsic reward
+        """
+        raise NotImplementedError("")
+
+    def save_checkpoint(self, checkpoint_prefix: str) -> None:
+        """
+        Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
+        :param checkpoint_prefix: The prefix of the checkpoint file to save
+        :return: None
+        """
+        raise NotImplementedError("")
+
+    def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
+        """
+        Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
+        type of prediction_type, or if there is more than one possible way to do so, raise a ValueException.
+        :param states:
+        :param prediction_type:
+        :return: the agent's prediction
+        """
+        raise NotImplementedError("")
+
+    def set_incoming_directive(self, action: ActionType) -> None:
+        """
+        Pass a higher level command (directive) to the agent.
+        For example, a higher level agent can set the goal of the agent.
+        :param action: the directive to pass to the agent
+        :return: None
+        """
+        raise NotImplementedError("")
+
+    def collect_savers(self, parent_path_suffix: str) -> SaverCollection:
+        """
+        Collect all of the agent's savers
+        :param parent_path_suffix: path suffix of the parent of the agent
+            (could be name of level manager or composite agent)
+        :return: collection of all agent savers
+        """
+        raise NotImplementedError("")
+
+    def handle_episode_ended(self) -> None:
+        """
+        Make any changes needed when each episode is ended.
+        This includes incrementing counters, updating full episode dependent values, updating logs, etc.
+        This function is called right after each episode is ended.
+
+        :return: None
+        """
+        raise NotImplementedError("")
+
+    def run_off_policy_evaluation(self) -> None:
+        """
+        Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
+        Should only be implemented for off-policy RL algorithms.
+
+        :return: None
+        """
+        raise NotImplementedError("")
\ No newline at end of file
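To make the contract above concrete, here is a rough, do-nothing implementation of the interface (purely an illustration, not shipped with Coach; it assumes an action space object exposing sample()):

class RandomAgent(AgentInterface):
    """Minimal sketch: acts randomly, stores nothing, and never learns."""

    def __init__(self, action_space):
        super().__init__()
        self.action_space = action_space  # assumed to expose sample()

    def reset_internal_state(self) -> None:
        pass  # no per-episode state to reset

    def train(self) -> float:
        return 0.0  # no networks -> the "loss" is trivially zero

    def act(self) -> ActionInfo:
        return ActionInfo(action=self.action_space.sample())

    def observe(self, env_response: EnvResponse) -> bool:
        # nothing is stored; just propagate the environment's done signal
        return env_response.game_over

    def handle_episode_ended(self) -> None:
        pass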
diff --git a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
index 667a484..0b81831 100644
--- a/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/categorical_dqn_agent.html
@@ -264,8 +264,8 @@
    # prediction's format is (batch, actions, atoms)
    def get_all_q_values_for_states(self, states: StateType):
        if self.exploration_policy.requires_action_values():
-            prediction = self.get_prediction(states)
-            q_values = self.distribution_prediction_to_q_values(prediction)
+            q_values = self.get_prediction(states,
+                                           outputs=[self.networks['main'].online_network.output_heads[0].q_values])
        else:
            q_values = None
        return q_values
@@ -280,11 +280,14 @@
                                                                    (self.networks['main'].online_network, batch.states(network_keys))])

+        # add Q value samples for logging
+        self.q_values.add_sample(self.distribution_prediction_to_q_values(TD_targets))
+
        # select the optimal actions for the next state
        target_actions = np.argmax(self.distribution_prediction_to_q_values(distributional_q_st_plus_1), axis=1)
-        m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
+        m = np.zeros((batch.size, self.z_values.size))

-        batches = np.arange(self.ap.network_wrappers['main'].batch_size)
+        batches = np.arange(batch.size)

        # an alternative to the for loop. 3.7x perf improvement vs. the same code done with for looping.
        # only 10% speedup overall - leaving commented out as the code is not as clear.
@@ -297,7 +300,7 @@
        # bj_ = (tzj_ - self.z_values[0]) / (self.z_values[1] - self.z_values[0])
        # u_ = (np.ceil(bj_)).astype(int)
        # l_ = (np.floor(bj_)).astype(int)
-        # m_ = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
+        # m_ = np.zeros((batch.size, self.z_values.size))
        # np.add.at(m_, [batches, l_],
        #           np.transpose(distributional_q_st_plus_1[batches, target_actions], (1, 0)) * (u_ - bj_))
        # np.add.at(m_, [batches, u_],
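For reference, the `distribution_prediction_to_q_values` helper used above reduces the categorical prediction to scalar Q values by taking the expectation over the fixed support `z_values`. A minimal numpy sketch of that reduction, assuming a prediction shaped (batch, actions, atoms):

import numpy as np

def distribution_prediction_to_q_values(prediction, z_values):
    # Q(s, a) = sum_i p_i(s, a) * z_i  -> result has shape (batch, actions)
    return np.dot(prediction, z_values)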
diff --git a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
index ff0397c..05b636e 100644
--- a/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
+++ b/docs/_modules/rl_coach/agents/clipped_ppo_agent.html
@@ -387,6 +387,8 @@
                   self.networks['main'].online_network.output_heads[1].likelihood_ratio,
                   self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]

+        # TODO-fixme if batch.size / self.ap.network_wrappers['main'].batch_size is not an integer, we do not train on
+        # some of the data
        for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
            start = i * self.ap.network_wrappers['main'].batch_size
            end = (i + 1) * self.ap.network_wrappers['main'].batch_size
diff --git a/docs/_modules/rl_coach/agents/dfp_agent.html b/docs/_modules/rl_coach/agents/dfp_agent.html
index 36aca38..447c31e 100644
--- a/docs/_modules/rl_coach/agents/dfp_agent.html
+++ b/docs/_modules/rl_coach/agents/dfp_agent.html
@@ -326,13 +326,13 @@
        network_inputs = batch.states(network_keys)
        network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
-                                           self.ap.network_wrappers['main'].batch_size, axis=0)
+                                           batch.size, axis=0)

        # get the current outputs of the network
        targets = self.networks['main'].online_network.predict(network_inputs)

        # change the targets for the taken actions
-        for i in range(self.ap.network_wrappers['main'].batch_size):
+        for i in range(batch.size):
            targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()

        result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
diff --git a/docs/_modules/rl_coach/agents/dqn_agent.html b/docs/_modules/rl_coach/agents/dqn_agent.html
index 5024209..01cd007 100644
--- a/docs/_modules/rl_coach/agents/dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/dqn_agent.html
@@ -250,6 +250,9 @@
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

+    def select_actions(self, next_states, q_st_plus_1):
+        return np.argmax(q_st_plus_1, 1)
+
    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
@@ -261,11 +264,16 @@
                                                               (self.networks['main'].online_network, batch.states(network_keys))])

+        selected_actions = self.select_actions(batch.next_states(network_keys), q_st_plus_1)
+
+        # add Q value samples for logging
+        self.q_values.add_sample(TD_targets)
+
        # only update the action that we have actually done in this transition
        TD_errors = []
-        for i in range(self.ap.network_wrappers['main'].batch_size):
+        for i in range(batch.size):
            new_target = batch.rewards()[i] + \
-                         (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
+                         (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
            TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
            TD_targets[i, batch.actions()[i]] = new_target
diff --git a/docs/_modules/rl_coach/agents/mmc_agent.html b/docs/_modules/rl_coach/agents/mmc_agent.html
index ad10e45..ad5c0f2 100644
--- a/docs/_modules/rl_coach/agents/mmc_agent.html
+++ b/docs/_modules/rl_coach/agents/mmc_agent.html
@@ -245,7 +245,7 @@
        total_returns = batch.n_step_discounted_rewards()

-        for i in range(self.ap.network_wrappers['main'].batch_size):
+        for i in range(batch.size):
            one_step_target = batch.rewards()[i] + \
                              (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
                              q_st_plus_1[i][selected_actions[i]]
diff --git a/docs/_modules/rl_coach/agents/n_step_q_agent.html b/docs/_modules/rl_coach/agents/n_step_q_agent.html
index d112464..e827485 100644
--- a/docs/_modules/rl_coach/agents/n_step_q_agent.html
+++ b/docs/_modules/rl_coach/agents/n_step_q_agent.html
@@ -303,6 +303,9 @@
        else:
            assert True, 'The available values for targets_horizon are: 1-Step, N-Step'

+        # add Q value samples for logging
+        self.q_values.add_sample(state_value_head_targets)
+
        # train
        result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys),
                                                                           [state_value_head_targets])
diff --git a/docs/_modules/rl_coach/agents/nec_agent.html b/docs/_modules/rl_coach/agents/nec_agent.html
index 5f1e0f4..51d49fd 100644
--- a/docs/_modules/rl_coach/agents/nec_agent.html
+++ b/docs/_modules/rl_coach/agents/nec_agent.html
@@ -313,7 +313,7 @@
        TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
        bootstrapped_return_from_old_policy = batch.n_step_discounted_rewards()

        # only update the action that we have actually done in this transition
-        for i in range(self.ap.network_wrappers['main'].batch_size):
+        for i in range(batch.size):
            TD_targets[i, batch.actions()[i]] = bootstrapped_return_from_old_policy[i]

        # set the gradients to fetch for the DND update
@@ -342,7 +342,7 @@
        embedding = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(self.curr_state, 'main'),
                                                                 outputs=self.networks['main'].online_network.state_embedding)
-        self.current_episode_state_embeddings.append(embedding)
+        self.current_episode_state_embeddings.append(embedding.squeeze())

        return super().act()
diff --git a/docs/_modules/rl_coach/agents/pal_agent.html b/docs/_modules/rl_coach/agents/pal_agent.html
index 35c8cd3..bc8799b 100644
--- a/docs/_modules/rl_coach/agents/pal_agent.html
+++ b/docs/_modules/rl_coach/agents/pal_agent.html
@@ -264,7 +264,7 @@
        # calculate TD error
        TD_targets = np.copy(q_st_online)
        total_returns = batch.n_step_discounted_rewards()
-        for i in range(self.ap.network_wrappers['main'].batch_size):
+        for i in range(batch.size):
            TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
                                                (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
                                                q_st_plus_1_target[i][selected_actions[i]]
diff --git a/docs/_modules/rl_coach/agents/qr_dqn_agent.html b/docs/_modules/rl_coach/agents/qr_dqn_agent.html
index 568ce17..307e130 100644
--- a/docs/_modules/rl_coach/agents/qr_dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/qr_dqn_agent.html
@@ -268,11 +268,14 @@
                                                          (self.networks['main'].online_network, batch.states(network_keys))])

+        # add Q value samples for logging
+        self.q_values.add_sample(self.get_q_values(current_quantiles))
+
        # get the optimal actions to take for the next states
        target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)

        # calculate the Bellman update
-        batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))
+        batch_idx = list(range(batch.size))

        TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
                     * next_state_quantiles[batch_idx, target_actions]
@@ -283,9 +286,9 @@
        # calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
        cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms)  # tau_i
        quantile_midpoints = 0.5 * (cumulative_probabilities[1:] + cumulative_probabilities[:-1])  # tau^hat_i
-        quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
+        quantile_midpoints = np.tile(quantile_midpoints, (batch.size, 1))
        sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
-        for idx in range(self.ap.network_wrappers['main'].batch_size):
+        for idx in range(batch.size):
            quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]

        # train
diff --git a/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html b/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
index 542e9e3..601dd13 100644
--- a/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
+++ b/docs/_modules/rl_coach/agents/rainbow_dqn_agent.html
@@ -240,9 +240,12 @@
    def __init__(self):
        super().__init__()
        self.algorithm = RainbowDQNAlgorithmParameters()
+
+        # ParameterNoiseParameters is changing the network wrapper parameters. This line needs to be done first.
+        self.network_wrappers = {"main": RainbowDQNNetworkParameters()}
+
        self.exploration = ParameterNoiseParameters(self)
        self.memory = PrioritizedExperienceReplayParameters()
-        self.network_wrappers = {"main": RainbowDQNNetworkParameters()}

    @property
    def path(self):
@@ -275,11 +278,14 @@
                                                                  (self.networks['main'].online_network, batch.states(network_keys))])

+        # add Q value samples for logging
+        self.q_values.add_sample(self.distribution_prediction_to_q_values(TD_targets))
+
        # only update the action that we have actually done in this transition (using the Double-DQN selected actions)
        target_actions = ddqn_selected_actions

-        m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
+        m = np.zeros((batch.size, self.z_values.size))

-        batches = np.arange(self.ap.network_wrappers['main'].batch_size)
+        batches = np.arange(batch.size)

        for j in range(self.z_values.size):
            # we use batch.info('should_bootstrap_next_state') instead of (1 - batch.game_overs()) since with n-step,
            # we will not bootstrap for the last n-step transitions in the episode
diff --git a/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html b/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html
new file mode 100644
index 0000000..33668b2
--- /dev/null
+++ b/docs/_modules/rl_coach/agents/soft_actor_critic_agent.html
@@ -0,0 +1,554 @@
+ rl_coach.agents.soft_actor_critic_agent — Reinforcement Learning Coach 0.11.0 documentation
Source code for rl_coach.agents.soft_actor_critic_agent
+#
+# Copyright (c) 2019 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Union
+import copy
+import numpy as np
+from collections import OrderedDict
+
+from rl_coach.agents.agent import Agent
+from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
+
+from rl_coach.architectures.head_parameters import SACQHeadParameters, SACPolicyHeadParameters, VHeadParameters
+from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
+from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, EmbedderScheme, MiddlewareScheme
+from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
+from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
+from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
+from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
+from rl_coach.spaces import BoxActionSpace
+
+
+# There are 3 networks in SAC implementation. All have the same topology but parameters are not shared.
+# The networks are:
+# 1. State Value Network - SACValueNetwork
+# 2. Soft Q Value Network - SACCriticNetwork
+# 3. Policy Network - SACPolicyNetwork - currently supporting only Gaussian Policy
+
+
+# 1. State Value Network - SACValueNetwork
+# this is the state value network in SAC.
+# The network is trained to predict (regression) the state value in the max-entropy settings
+# The objective to be minimized is given in equation (5) in the paper:
+#
+# J(psi)= E_(s~D)[0.5*(V_psi(s)-y(s))^2]
+# where y(s) = E_(a~pi)[Q_theta(s,a)-log(pi(a|s))]
+
+
+# Default parameters for value network:
+# topology :
+# input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+# middleware : EmbedderScheme.Medium (Dense(256)) , relu activation
+
+
+class SACValueNetworkParameters(NetworkParameters):
+    def __init__(self):
+        super().__init__()
+        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='relu')}
+        self.middleware_parameters = FCMiddlewareParameters(activation_function='relu')
+        self.heads_parameters = [VHeadParameters(initializer='xavier')]
+        self.rescale_gradient_from_head_by_factor = [1]
+        self.optimizer_type = 'Adam'
+        self.batch_size = 256
+        self.async_training = False
+        self.learning_rate = 0.0003  # 3e-4 see appendix D in the paper
+        self.create_target_network = True  # tau is set in SoftActorCriticAlgorithmParameters.rate_for_copying_weights_to_target
+
+
+# 2. Soft Q Value Network - SACCriticNetwork
+# the whole network is built in the SACQHeadParameters. we use empty input embedder and middleware
+class SACCriticNetworkParameters(NetworkParameters):
+    def __init__(self):
+        super().__init__()
+        self.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
+        self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Empty)
+        self.heads_parameters = [SACQHeadParameters()]  # SACQHeadParameters includes the topology of the head
+        self.rescale_gradient_from_head_by_factor = [1]
+        self.optimizer_type = 'Adam'
+        self.batch_size = 256
+        self.async_training = False
+        self.learning_rate = 0.0003
+        self.create_target_network = False
+
+
+# 3. policy Network
+# Default parameters for policy network:
+# topology :
+#   input embedder : EmbedderScheme.Medium (Dense(256)) , relu activation
+#   middleware : EmbedderScheme = [Dense(256)] , relu activation --> scheme should be overridden in preset
+class SACPolicyNetworkParameters(NetworkParameters):
+    def __init__(self):
+        super().__init__()
+        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='relu')}
+        self.middleware_parameters = FCMiddlewareParameters(activation_function='relu')
+        self.heads_parameters = [SACPolicyHeadParameters()]
+        self.rescale_gradient_from_head_by_factor = [1]
+        self.optimizer_type = 'Adam'
+        self.batch_size = 256
+        self.async_training = False
+        self.learning_rate = 0.0003
+        self.create_target_network = False
+        self.l2_regularization = 0  # weight decay regularization. not used in the original paper
+
+
+# Algorithm Parameters
+
+
+class SoftActorCriticAlgorithmParameters(AlgorithmParameters):
+    """
+    :param num_steps_between_copying_online_weights_to_target: (StepMethod)
+        The number of steps between copying the online network weights to the target network weights.
+
+    :param rate_for_copying_weights_to_target: (float)
+        When copying the online network weights to the target network weights, a soft update will be used, which
+        weights the new online network weights by rate_for_copying_weights_to_target. (Tau as defined in the paper)
+
+    :param use_deterministic_for_evaluation: (bool)
+        If True, during the evaluation phase, actions are chosen deterministically according to the policy mean
+        and not sampled from the policy distribution.
+    """
+    def __init__(self):
+        super().__init__()
+        self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
+        self.rate_for_copying_weights_to_target = 0.005
+        self.use_deterministic_for_evaluation = True  # evaluate agent using deterministic policy (i.e. take the mean value)
+
+
+class SoftActorCriticAgentParameters(AgentParameters):
+    def __init__(self):
+        super().__init__(algorithm=SoftActorCriticAlgorithmParameters(),
+                         exploration=AdditiveNoiseParameters(),
+                         memory=ExperienceReplayParameters(),  # SAC doesn't use episodic related data
+                         # network wrappers:
+                         networks=OrderedDict([("policy", SACPolicyNetworkParameters()),
+                                               ("q", SACCriticNetworkParameters()),
+                                               ("v", SACValueNetworkParameters())]))
+
+    @property
+    def path(self):
+        return 'rl_coach.agents.soft_actor_critic_agent:SoftActorCriticAgent'
+
+
+# Soft Actor Critic - https://arxiv.org/abs/1801.01290
+class SoftActorCriticAgent(PolicyOptimizationAgent):
+    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
+        super().__init__(agent_parameters, parent)
+        self.last_gradient_update_step_idx = 0
+
+        # register signals to track (in learn_from_batch)
+        self.policy_means = self.register_signal('Policy_mu_avg')
+        self.policy_logsig = self.register_signal('Policy_logsig')
+        self.policy_logprob_sampled = self.register_signal('Policy_logp_sampled')
+        self.policy_grads = self.register_signal('Policy_grads_sumabs')
+
+        self.q1_values = self.register_signal("Q1")
+        self.TD_err1 = self.register_signal("TD err1")
+        self.q2_values = self.register_signal("Q2")
+        self.TD_err2 = self.register_signal("TD err2")
+        self.v_tgt_ns = self.register_signal('V_tgt_ns')
+        self.v_onl_ys = self.register_signal('V_onl_ys')
+        self.action_signal = self.register_signal("actions")
+
+    def learn_from_batch(self, batch):
+        #########################################
+        # need to update the following networks:
+        # 1. actor (policy)
+        # 2. state value (v)
+        # 3. critic (q1 and q2)
+        # 4. target network - probably already handled by V
+
+        #########################################
+        # define the networks to be used
+
+        # State Value Network
+        value_network = self.networks['v']
+        value_network_keys = self.ap.network_wrappers['v'].input_embedders_parameters.keys()
+
+        # Critic Network
+        q_network = self.networks['q'].online_network
+        q_head = q_network.output_heads[0]
+        q_network_keys = self.ap.network_wrappers['q'].input_embedders_parameters.keys()
+
+        # Actor (policy) Network
+        policy_network = self.networks['policy'].online_network
+        policy_network_keys = self.ap.network_wrappers['policy'].input_embedders_parameters.keys()
+
+        ##########################################
+        # 1. updating the actor - according to (13) in the paper
+        policy_inputs = copy.copy(batch.states(policy_network_keys))
+        policy_results = policy_network.predict(policy_inputs)
+
+        policy_mu, policy_std, sampled_raw_actions, sampled_actions, sampled_actions_logprob, \
+            sampled_actions_logprob_mean = policy_results
+
+        self.policy_means.add_sample(policy_mu)
+        self.policy_logsig.add_sample(policy_std)
+        self.policy_logprob_sampled.add_sample(sampled_actions_logprob_mean)
+
+        # get the state-action values for the replayed states and their corresponding actions from the policy
+        q_inputs = copy.copy(batch.states(q_network_keys))
+        q_inputs['output_0_0'] = sampled_actions
+        log_target = q_network.predict(q_inputs)[0].squeeze()
+
+        # log internal q values
+        q1_vals, q2_vals = q_network.predict(q_inputs, outputs=[q_head.q1_output, q_head.q2_output])
+        self.q1_values.add_sample(q1_vals)
+        self.q2_values.add_sample(q2_vals)
+
+        # calculate the gradients according to (13)
+        # get the gradients of log_prob w.r.t the weights (parameters) - indicated as phi in the paper
+        initial_feed_dict = {policy_network.gradients_weights_ph[5]: np.array(1.0)}
+        dlogp_dphi = policy_network.predict(policy_inputs,
+                                            outputs=policy_network.weighted_gradients[5],
+                                            initial_feed_dict=initial_feed_dict)
+
+        # calculate dq_da
+        dq_da = q_network.predict(q_inputs,
+                                  outputs=q_network.gradients_wrt_inputs[1]['output_0_0'])
+
+        # calculate da_dphi
+        initial_feed_dict = {policy_network.gradients_weights_ph[3]: dq_da}
+        dq_dphi = policy_network.predict(policy_inputs,
+                                         outputs=policy_network.weighted_gradients[3],
+                                         initial_feed_dict=initial_feed_dict)
+
+        # now given dlogp_dphi, dq_dphi we need to calculate the policy gradients according to (13)
+        policy_grads = [dlogp_dphi[l] - dq_dphi[l] for l in range(len(dlogp_dphi))]
+
+        # apply the gradients to policy networks
+        policy_network.apply_gradients(policy_grads)
+        grads_sumabs = np.sum([np.sum(np.abs(policy_grads[l])) for l in range(len(policy_grads))])
+        self.policy_grads.add_sample(grads_sumabs)
+
+        ##########################################
+        # 2. updating the state value online network weights
+        # done by calculating the targets for the v head according to (5) in the paper
+        # value_targets = log_targets - sampled_actions_logprob
+        value_inputs = copy.copy(batch.states(value_network_keys))
+        value_targets = log_target - sampled_actions_logprob
+
+        self.v_onl_ys.add_sample(value_targets)
+
+        # call value_network apply gradients with this target
+        value_loss = value_network.online_network.train_on_batch(value_inputs, value_targets[:, None])[0]
+
+        ##########################################
+        # 3. updating the critic (q networks)
+        # updating q networks according to (7) in the paper
+
+        # define the input to the q network: state has been already updated previously, but now we need
+        # the actions from the batch (and not those sampled by the policy)
+        q_inputs['output_0_0'] = batch.actions(len(batch.actions().shape) == 1)
+
+        # define the targets : scale_reward * reward + (1-terminal)*discount*v_target_next_state
+        # define v_target_next_state
+        value_inputs = copy.copy(batch.next_states(value_network_keys))
+        v_target_next_state = value_network.target_network.predict(value_inputs)
+        self.v_tgt_ns.add_sample(v_target_next_state)
+        # Note: reward is assumed to be rescaled by RewardRescaleFilter in the preset parameters
+        TD_targets = batch.rewards(expand_dims=True) + \
+            (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * v_target_next_state
+
+        # call critic network update
+        result = q_network.train_on_batch(q_inputs, TD_targets, additional_fetches=[q_head.q1_loss, q_head.q2_loss])
+        total_loss, losses, unclipped_grads = result[:3]
+        q1_loss, q2_loss = result[3]
+        self.TD_err1.add_sample(q1_loss)
+        self.TD_err2.add_sample(q2_loss)
+
+        ##########################################
+        # 4. updating the value target network
+        # we just need to set the parameter rate_for_copying_weights_to_target in the agent parameters to be 1-tau,
+        # where tau is the hyper parameter as defined in the sac original implementation
+
+        return total_loss, losses, unclipped_grads
+
+    def get_prediction(self, states):
+        """
+        get the mean and stdev of the policy distribution given 'states'
+        :param states: the states for which we need to sample actions from the policy
+        :return: mean and stdev
+        """
+        tf_input_state = self.prepare_batch_for_inference(states, 'policy')
+        return self.networks['policy'].online_network.predict(tf_input_state)
+
+    def train(self):
+        # since the algorithm works with an experience replay buffer (non-episodic),
+        # we can't use the policy optimization train method. we need Agent.train
+        # note that since in Agent.train there is no apply_gradients, we need to do it in learn_from_batch
+        return Agent.train(self)
+
+    def choose_action(self, curr_state):
+        """
+        choose_action - chooses the most likely action
+        if 'deterministic' - take the mean of the policy which is the prediction of the policy network.
+        else - use the exploration policy
+        :param curr_state:
+        :return: action wrapped in ActionInfo
+        """
+        if not isinstance(self.spaces.action, BoxActionSpace):
+            raise ValueError("SAC works only for continuous control problems")
+        # convert to batch so we can run it through the network
+        tf_input_state = self.prepare_batch_for_inference(curr_state, 'policy')
+        # use the online network for prediction
+        policy_network = self.networks['policy'].online_network
+        policy_head = policy_network.output_heads[0]
+        result = policy_network.predict(tf_input_state,
+                                        outputs=[policy_head.policy_mean, policy_head.actions])
+        action_mean, action_sample = result
+
+        # if using deterministic policy, take the mean values. else, use exploration policy to sample from the pdf
+        if self.phase == RunPhase.TEST and self.ap.algorithm.use_deterministic_for_evaluation:
+            action = action_mean[0]
+        else:
+            action = action_sample[0]
+
+        self.action_signal.add_sample(action)
+
+        action_info = ActionInfo(action=action)
+        return action_info
\ No newline at end of file
diff --git a/docs/_modules/rl_coach/agents/value_optimization_agent.html b/docs/_modules/rl_coach/agents/value_optimization_agent.html
index 02aacb4..730229c 100644
--- a/docs/_modules/rl_coach/agents/value_optimization_agent.html
+++ b/docs/_modules/rl_coach/agents/value_optimization_agent.html
@@ -193,16 +193,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
+from collections import OrderedDict
from typing import Union

import numpy as np

from rl_coach.agents.agent import Agent
-from rl_coach.core_types import ActionInfo, StateType
+from rl_coach.core_types import ActionInfo, StateType, Batch
+from rl_coach.logger import screen
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
from rl_coach.spaces import DiscreteActionSpace
-
+from copy import deepcopy

## This is an abstract agent - there is no learn_from_batch method ##
@@ -229,8 +230,9 @@
            actions_q_values = None
        return actions_q_values

-    def get_prediction(self, states):
-        return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))
+    def get_prediction(self, states, outputs=None):
+        return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'),
+                                                            outputs=outputs)

    def update_transition_priorities_and_get_weights(self, TD_errors, batch):
        # update errors in prioritized replay buffer
@@ -259,10 +261,12 @@
        # this is for bootstrapped dqn
        if type(actions_q_values) == list and len(actions_q_values) > 0:
            actions_q_values = self.exploration_policy.last_action_values
-        actions_q_values = actions_q_values.squeeze()

        # store the q values statistics for logging
        self.q_values.add_sample(actions_q_values)
+
+        actions_q_values = actions_q_values.squeeze()
+
        for i, q_value in enumerate(actions_q_values):
            self.q_value_for_action[i].add_sample(q_value)

@@ -276,6 +280,77 @@
    def learn_from_batch(self, batch):
        raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")

+    def run_off_policy_evaluation(self):
+        """
+        Run the off-policy evaluation estimators to get a prediction for the performance of the current policy based on
+        an evaluation dataset, which was collected by another policy(ies).
+        :return: None
+        """
+        assert self.ope_manager
+        dataset_as_episodes = self.call_memory('get_all_complete_episodes_from_to',
+                                               (self.call_memory('get_last_training_set_episode_id') + 1,
+                                                self.call_memory('num_complete_episodes')))
+        if len(dataset_as_episodes) == 0:
+            raise ValueError('train_to_eval_ratio is too high causing the evaluation set to be empty. '
+                             'Consider decreasing its value.')
+
+        ips, dm, dr, seq_dr = self.ope_manager.evaluate(
+            dataset_as_episodes=dataset_as_episodes,
+            batch_size=self.ap.network_wrappers['main'].batch_size,
+            discount_factor=self.ap.algorithm.discount,
+            reward_model=self.networks['reward_model'].online_network,
+            q_network=self.networks['main'].online_network,
+            network_keys=list(self.ap.network_wrappers['main'].input_embedders_parameters.keys()))
+
+        # get the estimators out to the screen
+        log = OrderedDict()
+        log['Epoch'] = self.training_epoch
+        log['IPS'] = ips
+        log['DM'] = dm
+        log['DR'] = dr
+        log['Sequential-DR'] = seq_dr
+        screen.log_dict(log, prefix='Off-Policy Evaluation')
+
+        # get the estimators out to dashboard
+        self.agent_logger.set_current_time(self.get_current_time() + 1)
+        self.agent_logger.create_signal_value('Inverse Propensity Score', ips)
+        self.agent_logger.create_signal_value('Direct Method Reward', dm)
+        self.agent_logger.create_signal_value('Doubly Robust', dr)
+        self.agent_logger.create_signal_value('Sequential Doubly Robust', seq_dr)
+
+    def get_reward_model_loss(self, batch: Batch):
+        network_keys = self.ap.network_wrappers['reward_model'].input_embedders_parameters.keys()
+        current_rewards_prediction_for_all_actions = self.networks['reward_model'].online_network.predict(
+            batch.states(network_keys))
+        current_rewards_prediction_for_all_actions[range(batch.size), batch.actions()] = batch.rewards()
+
+        return self.networks['reward_model'].train_and_sync_networks(
+            batch.states(network_keys), current_rewards_prediction_for_all_actions)[0]
+
+    def improve_reward_model(self, epochs: int):
+        """
+        Train a reward model to be used by the doubly-robust estimator
+
+        :param epochs: The total number of epochs to use for training a reward model
+        :return: None
+        """
+        batch_size = self.ap.network_wrappers['reward_model'].batch_size
+
+        # this is fitted from the training dataset
+        for epoch in range(epochs):
+            loss = 0
+            total_transitions_processed = 0
+            for i, batch in enumerate(self.call_memory('get_shuffled_data_generator', batch_size)):
+                batch = Batch(batch)
+                loss += self.get_reward_model_loss(batch)
+                total_transitions_processed += batch.size
+
+            log = OrderedDict()
+            log['Epoch'] = epoch
+            log['loss'] = loss / total_transitions_processed
+            screen.log_dict(log, prefix='Training Reward Model')
+
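The fancy-indexing line in `get_reward_model_loss` above replaces the prediction only for the action that was actually taken in each transition, so every other action contributes zero error to the regression. A tiny numpy illustration of that trick:

import numpy as np

preds = np.array([[0.1, 0.2, 0.3],
                  [0.4, 0.5, 0.6]])     # predicted reward per action, shape (batch, actions)
actions = np.array([2, 0])              # action taken in each transition
rewards = np.array([1.0, -1.0])         # observed rewards

preds[range(len(actions)), actions] = rewards
# preds -> [[0.1, 0.2, 1.0], [-1.0, 0.5, 0.6]]: only the taken actions carry a training signal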
diff --git a/docs/_modules/rl_coach/base_parameters.html b/docs/_modules/rl_coach/base_parameters.html
index 764fd8f..7f356db 100644
--- a/docs/_modules/rl_coach/base_parameters.html
+++ b/docs/_modules/rl_coach/base_parameters.html
@@ -205,6 +205,7 @@
from rl_coach.core_types import TrainingSteps, EnvironmentSteps, GradientClippingMethod, RunPhase, \
    SelectedPhaseOnlyDumpFilter, MaxDumpFilter
from rl_coach.filters.filter import NoInputFilter
+from rl_coach.logger import screen


class Frameworks(Enum):
@@ -379,9 +380,6 @@
        # distributed agents params
        self.share_statistics_between_workers = True

-        # intrinsic reward
-        self.scale_external_reward_by_intrinsic_reward_value = False
-
        # n-step returns
        self.n_step = -1  # calculate the total return (no bootstrap, by default)
@@ -470,7 +468,8 @@
                 batch_size=32,
                 replace_mse_with_huber_loss=False,
                 create_target_network=False,
-                 tensorflow_support=True):
+                 tensorflow_support=True,
+                 softmax_temperature=1):
        """
        :param force_cpu: Force the neural networks to run on the CPU even if a GPU is available
@@ -553,6 +552,8 @@
            online network at will.
        :param tensorflow_support: A flag which specifies if the network is supported by the TensorFlow framework.
+        :param softmax_temperature:
+            If a softmax is present in the network head output, use this temperature
        """
        super().__init__()
        self.framework = Frameworks.tensorflow
@@ -583,16 +584,19 @@
        self.heads_parameters = heads_parameters
        self.use_separate_networks_per_head = use_separate_networks_per_head
        self.optimizer_type = optimizer_type
+        self.replace_mse_with_huber_loss = replace_mse_with_huber_loss
+        self.create_target_network = create_target_network
+
+        # Framework support
+        self.tensorflow_support = tensorflow_support
+
+        # Hyper-Parameter values
        self.optimizer_epsilon = optimizer_epsilon
        self.adam_optimizer_beta1 = adam_optimizer_beta1
        self.adam_optimizer_beta2 = adam_optimizer_beta2
        self.rms_prop_optimizer_decay = rms_prop_optimizer_decay
        self.batch_size = batch_size
-        self.replace_mse_with_huber_loss = replace_mse_with_huber_loss
-        self.create_target_network = create_target_network
-
-        # Framework support
-        self.tensorflow_support = tensorflow_support
+        self.softmax_temperature = softmax_temperature


class NetworkComponentParameters(Parameters):
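For intuition, the new `softmax_temperature` parameter corresponds to the usual temperature-scaled softmax: the logits are divided by the temperature before normalizing, so higher values flatten the output distribution and 1 recovers the plain softmax. A small numpy sketch of that scaling:

import numpy as np

def softmax_with_temperature(logits, temperature=1.0):
    scaled = logits / temperature                                  # temperature=1 -> ordinary softmax
    exp = np.exp(scaled - np.max(scaled, axis=-1, keepdims=True))  # subtract max for numerical stability
    return exp / np.sum(exp, axis=-1, keepdims=True)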
@@ -723,6 +727,7 @@
self.is_a_highest_level_agent=Trueself.is_a_lowest_level_agent=Trueself.task_parameters=None
+ self.is_batch_rl_training=False@propertydefpath(self):
@@ -730,18 +735,22 @@
[docs]classTaskParameters(Parameters):
- def__init__(self,framework_type:Frameworks=Frameworks.tensorflow,evaluate_only:bool=False,use_cpu:bool=False,
+ def__init__(self,framework_type:Frameworks=Frameworks.tensorflow,evaluate_only:int=None,use_cpu:bool=False,experiment_path='/tmp',seed=None,checkpoint_save_secs=None,checkpoint_restore_dir=None,
- checkpoint_save_dir=None,export_onnx_graph:bool=False,apply_stop_condition:bool=False,
- num_gpu:int=1):
+ checkpoint_restore_path=None,checkpoint_save_dir=None,export_onnx_graph:bool=False,
+ apply_stop_condition:bool=False,num_gpu:int=1):""" :param framework_type: deep learning framework type. currently only tensorflow is supported
- :param evaluate_only: the task will be used only for evaluating the model
+ :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+ A value of 0 means that the task will be evaluated for an infinite number of steps. :param use_cpu: use the cpu for this task :param experiment_path: the path to the directory which will store all the experiment outputs :param seed: a seed to use for the random numbers generator :param checkpoint_save_secs: the number of seconds between each checkpoint saving
- :param checkpoint_restore_dir: the directory to restore the checkpoints from
+ :param checkpoint_restore_dir:
+ [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
+ the directory to restore the checkpoints from
+ :param checkpoint_restore_path: the path to restore the checkpoints from :param checkpoint_save_dir: the directory to store the checkpoints in :param export_onnx_graph: If set to True, this will export an onnx graph each time a checkpoint is saved :param apply_stop_condition: If set to True, this will apply the stop condition defined by reaching a target success rate
@@ -753,7 +762,13 @@
self.use_cpu=use_cpuself.experiment_path=experiment_pathself.checkpoint_save_secs=checkpoint_save_secs
- self.checkpoint_restore_dir=checkpoint_restore_dir
+ ifcheckpoint_restore_dir:
+ screen.warning('TaskParameters.checkpoint_restore_dir is DEPRECATED and will be removed in one of the next '
+ 'releases. Please switch to using TaskParameters.checkpoint_restore_path, with your '
+ 'directory path. ')
+ self.checkpoint_restore_path=checkpoint_restore_dir
+ else:
+ self.checkpoint_restore_path=checkpoint_restore_pathself.checkpoint_save_dir=checkpoint_save_dirself.seed=seedself.export_onnx_graph=export_onnx_graph
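A minimal usage sketch of the updated constructor, with placeholder paths; passing checkpoint_restore_dir instead would emit the deprecation warning above and be forwarded to checkpoint_restore_path:

    from rl_coach.base_parameters import TaskParameters

    # evaluate_only=None (the default) trains as usual; 0 would evaluate indefinitely
    task_params = TaskParameters(experiment_path='/tmp/my_experiment',
                                 evaluate_only=None,
                                 checkpoint_save_dir='/tmp/my_experiment/checkpoints',
                                 checkpoint_restore_path='/tmp/my_experiment/checkpoints',
                                 seed=1)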
@@ -763,13 +778,14 @@
[docs]classDistributedTaskParameters(TaskParameters):def__init__(self,framework_type:Frameworks,parameters_server_hosts:str,worker_hosts:str,job_type:str,
- task_index:int,evaluate_only:bool=False,num_tasks:int=None,
+ task_index:int,evaluate_only:int=None,num_tasks:int=None,num_training_tasks:int=None,use_cpu:bool=False,experiment_path=None,dnd=None,
- shared_memory_scratchpad=None,seed=None,checkpoint_save_secs=None,checkpoint_restore_dir=None,
+ shared_memory_scratchpad=None,seed=None,checkpoint_save_secs=None,checkpoint_restore_path=None,checkpoint_save_dir=None,export_onnx_graph:bool=False,apply_stop_condition:bool=False):""" :param framework_type: deep learning framework type. currently only tensorflow is supported
- :param evaluate_only: the task will be used only for evaluating the model
+ :param evaluate_only: if not None, the task will be used only for evaluating the model for the given number of steps.
+ A value of 0 means that the task will be evaluated for an infinite number of steps. :param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are assigned :param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned
@@ -782,7 +798,7 @@
:param dnd: an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad. :param seed: a seed to use for the random numbers generator :param checkpoint_save_secs: the number of seconds between each checkpoint saving
- :param checkpoint_restore_dir: the directory to restore the checkpoints from
+ :param checkpoint_restore_path: the path to restore the checkpoints from :param checkpoint_save_dir: the directory to store the checkpoints in :param export_onnx_graph: If set to True, this will export an onnx graph each time a checkpoint is saved :param apply_stop_condition: If set to True, this will apply the stop condition defined by reaching a target success rate
@@ -790,7 +806,7 @@
"""super().__init__(framework_type=framework_type,evaluate_only=evaluate_only,use_cpu=use_cpu,experiment_path=experiment_path,seed=seed,checkpoint_save_secs=checkpoint_save_secs,
- checkpoint_restore_dir=checkpoint_restore_dir,checkpoint_save_dir=checkpoint_save_dir,
+ checkpoint_restore_path=checkpoint_restore_path,checkpoint_save_dir=checkpoint_save_dir,export_onnx_graph=export_onnx_graph,apply_stop_condition=apply_stop_condition)self.parameters_server_hosts=parameters_server_hostsself.worker_hosts=worker_hosts
diff --git a/docs/_modules/rl_coach/core_types.html b/docs/_modules/rl_coach/core_types.html
index ddab0b8..8d4c633 100644
--- a/docs/_modules/rl_coach/core_types.html
+++ b/docs/_modules/rl_coach/core_types.html
@@ -193,9 +193,10 @@
# See the License for the specific language governing permissions and# limitations under the License.#
-
+fromcollectionsimportnamedtupleimportcopy
+importmathfromenumimportEnumfromrandomimportshufflefromtypingimportList,Union,Dict,Any,Type
@@ -218,6 +219,17 @@
Measurements=4
+Record=namedtuple('Record',['name','label'])
+
+
+classTimeTypes(Enum):
+ EpisodeNumber=Record(name='Episode #',label='Episode #')
+ TrainingIteration=Record(name='Training Iter',label='Training Iteration')
+ EnvironmentSteps=Record(name='Total steps',label='Total steps (per worker)')
+ WallClockTime=Record(name='Wall-Clock Time',label='Wall-Clock Time (minutes)')
+ Epoch=Record(name='Epoch',label='Epoch #')
+
+
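Each TimeTypes member wraps a Record with a short name and a longer display label, so downstream code can pick either, e.g.:

    from rl_coach.core_types import TimeTypes

    print(TimeTypes.Epoch.value.name)           # 'Epoch'
    print(TimeTypes.WallClockTime.value.label)  # 'Wall-Clock Time (minutes)'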
# step methodsclassStepMethod(object):
@@ -232,6 +244,37 @@
defnum_steps(self,val:int)->None:self._num_steps=val
+ def__eq__(self,other):
+ returnself.num_steps==other.num_steps
+
+ def__truediv__(self,other):
+ """
+ Divide this step method by other. If other is an integer, returns an object of the same
+ type as self. If other is of the same type as self, returns an integer. In either case, any
+ floating point value is rounded up under the assumption that if we are dividing Steps, we
+ would rather overestimate than underestimate.
+ """
+ ifisinstance(other,type(self)):
+ returnmath.ceil(self.num_steps/other.num_steps)
+ elifisinstance(other,int):
+ returntype(self)(math.ceil(self.num_steps/other))
+ else:
+ raiseTypeError("cannot divide {} by {}".format(type(self),type(other)))
+
+ def__rtruediv__(self,other):
+ """
+ Divide other by this step method. If other is an integer, returns an object of the same
+ type as self. If other is of the same type as self, returns an integer. In either case, any
+ floating point value is rounded up under the assumption that if we are dividing Steps, we
+ would rather overestimate than underestimate.
+ """
+ ifisinstance(other,type(self)):
+ returnmath.ceil(other.num_steps/self.num_steps)
+ elifisinstance(other,int):
+ returntype(self)(math.ceil(other/self.num_steps))
+ else:
+ raiseTypeError("cannot divide {} by {}".format(type(other),type(self)))
+
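With the rounding-up semantics described in the docstrings, division behaves roughly as follows; EnvironmentSteps is used here only as a concrete StepMethod subclass:

    from rl_coach.core_types import EnvironmentSteps

    total = EnvironmentSteps(100)
    print(total / EnvironmentSteps(32))   # 4, i.e. ceil(100 / 32), returned as an int
    print((total / 3).num_steps)          # 34, i.e. ceil(100 / 3), returned as EnvironmentSteps
    print((250 / total).num_steps)        # 3, i.e. ceil(250 / 100), via __rtruediv__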
classFrames(StepMethod):def__init__(self,num_steps):
@@ -429,6 +472,9 @@
.format(self.info.keys(),new_info.keys()))self.info.update(new_info)
+ defupdate_info(self,new_info:Dict[str,Any])->None:
+ self.info.update(new_info)
+
def__copy__(self):new_transition=type(self)()new_transition.__dict__.update(self.__dict__)
@@ -510,8 +556,7 @@
"""def__init__(self,action:ActionType,all_action_probabilities:float=0,
- action_value:float=0.,state_value:float=0.,max_action_value:float=None,
- action_intrinsic_reward:float=0):
+ action_value:float=0.,state_value:float=0.,max_action_value:float=None):""" :param action: the action :param all_action_probabilities: the probability that was assigned to the action when it was selected
@@ -520,8 +565,6 @@
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action that received the maximum value. if no value is given, the action is assumed to be the action with the maximum value
- :param action_intrinsic_reward: can contain any intrinsic reward that the agent wants to add to this action
- selection """self.action=actionself.all_action_probabilities=all_action_probabilities
@@ -530,8 +573,7 @@
ifnotmax_action_value:self.max_action_value=action_valueelse:
- self.max_action_value=max_action_value
- self.action_intrinsic_reward=action_intrinsic_reward
+ self.max_action_value=max_action_value
[docs]classBatch(object):
@@ -1047,6 +1089,19 @@
returnTrueelse:returnFalse
+
+
+# TODO move to a NamedTuple, once we move to Python3.6
+# https://stackoverflow.com/questions/34269772/type-hints-in-namedtuple/34269877
+classCsvDataset(object):
+ def__init__(self,filepath:str,is_episodic:bool=True):
+ self.filepath=filepath
+ self.is_episodic=is_episodic
+
+
+classPickledReplayBuffer(object):
+ def__init__(self,filepath:str):
+ self.filepath=filepath
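These lightweight descriptors are what the new memory-loading helpers later in this diff consume; a sketch with placeholder file paths:

    from rl_coach.core_types import CsvDataset, PickledReplayBuffer
    from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay

    memory = EpisodicExperienceReplay()
    # fills the buffer from a CSV of transitions (see load_csv below for the expected columns)
    memory.load_csv(CsvDataset('/tmp/transitions.csv', is_episodic=True))

    # a pickled replay buffer is referenced by path only and restored with load_pickled
    dataset = PickledReplayBuffer('/tmp/replay_buffer.p')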
diff --git a/docs/_modules/rl_coach/data_stores/nfs_data_store.html b/docs/_modules/rl_coach/data_stores/nfs_data_store.html
index 895aa21..39a2769 100644
--- a/docs/_modules/rl_coach/data_stores/nfs_data_store.html
+++ b/docs/_modules/rl_coach/data_stores/nfs_data_store.html
@@ -463,7 +463,12 @@
print("Got exception: %s\n while deleting NFS PVC",e)returnFalse
- returnTrue
+ returnTrue
+
+ defsetup_checkpoint_dir(self,crd=None):
+ ifcrd:
+ # TODO: find a way to upload this to the deployed nfs store.
+ pass
diff --git a/docs/_modules/rl_coach/data_stores/s3_data_store.html b/docs/_modules/rl_coach/data_stores/s3_data_store.html
index 287df0f..e766a50 100644
--- a/docs/_modules/rl_coach/data_stores/s3_data_store.html
+++ b/docs/_modules/rl_coach/data_stores/s3_data_store.html
@@ -257,6 +257,9 @@
returnTruedefsave_to_store(self):
+ self._save_to_store(self.params.checkpoint_dir)
+
+ def_save_to_store(self,checkpoint_dir):""" save_to_store() uploads the policy checkpoint, gifs and videos to the S3 data store. It reads the checkpoint state files and uploads only the latest checkpoint files to S3. It is used by the trainer in Coach when used in the distributed mode.
@@ -268,24 +271,32 @@
# Acquire lockself.mc.put_object(self.params.bucket_name,SyncFiles.LOCKFILE.value,io.BytesIO(b''),0)
- state_file=CheckpointStateFile(os.path.abspath(self.params.checkpoint_dir))
+ state_file=CheckpointStateFile(os.path.abspath(checkpoint_dir))ifstate_file.exists():ckpt_state=state_file.read()checkpoint_file=None
- forroot,dirs,filesinos.walk(self.params.checkpoint_dir):
+ forroot,dirs,filesinos.walk(checkpoint_dir):forfilenameinfiles:iffilename==CheckpointStateFile.checkpoint_state_filename:checkpoint_file=(root,filename)continueiffilename.startswith(ckpt_state.name):abs_name=os.path.abspath(os.path.join(root,filename))
- rel_name=os.path.relpath(abs_name,self.params.checkpoint_dir)
+ rel_name=os.path.relpath(abs_name,checkpoint_dir)self.mc.fput_object(self.params.bucket_name,rel_name,abs_name)abs_name=os.path.abspath(os.path.join(checkpoint_file[0],checkpoint_file[1]))
- rel_name=os.path.relpath(abs_name,self.params.checkpoint_dir)
+ rel_name=os.path.relpath(abs_name,checkpoint_dir)self.mc.fput_object(self.params.bucket_name,rel_name,abs_name)
+ # upload Finished if present
+ ifos.path.exists(os.path.join(checkpoint_dir,SyncFiles.FINISHED.value)):
+ self.mc.put_object(self.params.bucket_name,SyncFiles.FINISHED.value,io.BytesIO(b''),0)
+
+ # upload Ready if present
+ ifos.path.exists(os.path.join(checkpoint_dir,SyncFiles.TRAINER_READY.value)):
+ self.mc.put_object(self.params.bucket_name,SyncFiles.TRAINER_READY.value,io.BytesIO(b''),0)
+
# release lockself.mc.remove_object(self.params.bucket_name,SyncFiles.LOCKFILE.value)
@@ -301,6 +312,7 @@
ifself.params.expt_dirandos.path.exists(os.path.join(self.params.expt_dir,'gifs')):forfilenameinos.listdir(os.path.join(self.params.expt_dir,'gifs')):self.mc.fput_object(self.params.bucket_name,filename,os.path.join(self.params.expt_dir,'gifs',filename))
+
exceptResponseErrorase:print("Got exception: %s\n while saving to S3",e)
@@ -337,6 +349,18 @@
exceptExceptionase:pass
+ # Check if there's a ready file
+ objects=self.mc.list_objects_v2(self.params.bucket_name,SyncFiles.TRAINER_READY.value)
+
+ ifnext(objects,None)isnotNone:
+ try:
+ self.mc.fget_object(
+ self.params.bucket_name,SyncFiles.TRAINER_READY.value,
+ os.path.abspath(os.path.join(self.params.checkpoint_dir,SyncFiles.TRAINER_READY.value))
+ )
+ exceptExceptionase:
+ pass
+
checkpoint_state=state_file.read()ifcheckpoint_stateisnotNone:objects=self.mc.list_objects_v2(self.params.bucket_name,prefix=checkpoint_state.name,recursive=True)
@@ -346,7 +370,11 @@
self.mc.fget_object(obj.bucket_name,obj.object_name,filename)exceptResponseErrorase:
- print("Got exception: %s\n while loading from S3",e)
+ print("Got exception: %s\n while loading from S3",e)
+
+ defsetup_checkpoint_dir(self,crd=None):
+ ifcrd:
+ self._save_to_store(crd)
diff --git a/docs/_modules/rl_coach/environments/gym_environment.html b/docs/_modules/rl_coach/environments/gym_environment.html
index b8ce28c..5bda38e 100644
--- a/docs/_modules/rl_coach/environments/gym_environment.html
+++ b/docs/_modules/rl_coach/environments/gym_environment.html
@@ -480,7 +480,6 @@
random.seed(self.seed)# frame skip and max between consecutive frames
- self.is_robotics_env='robotics'instr(self.env.unwrapped.__class__)self.is_mujoco_env='mujoco'instr(self.env.unwrapped.__class__)self.is_roboschool_env='roboschool'instr(self.env.unwrapped.__class__)self.is_atari_env='Atari'instr(self.env.unwrapped.__class__)
@@ -501,7 +500,7 @@
self.state_space=StateSpace({})# observations
- ifnotisinstance(self.env.observation_space,gym.spaces.dict_space.Dict):
+ ifnotisinstance(self.env.observation_space,gym.spaces.dict.Dict):state_space={'observation':self.env.observation_space}else:state_space=self.env.observation_space.spaces
@@ -665,38 +664,11 @@
# initialize the number of livesself._update_ale_lives()
- def_set_mujoco_camera(self,camera_idx:int):
- """
- This function can be used to set the camera for rendering the mujoco simulator
- :param camera_idx: The index of the camera to use. Should be defined in the model
- :return: None
- """
- ifself.env.unwrapped.viewerisnotNoneandself.env.unwrapped.viewer.cam.fixedcamid!=camera_idx:
- frommujoco_py.generatedimportconst
- self.env.unwrapped.viewer.cam.type=const.CAMERA_FIXED
- self.env.unwrapped.viewer.cam.fixedcamid=camera_idx
-
- def_get_robotics_image(self):
- self.env.render()
- image=self.env.unwrapped._get_viewer().read_pixels(1600,900,depth=False)[::-1,:,:]
- image=scipy.misc.imresize(image,(270,480,3))
- returnimage
-
def_render(self):self.env.render(mode='human')
- # required for setting up a fixed camera for mujoco
- ifself.is_mujoco_envandnotself.is_roboschool_env:
- self._set_mujoco_camera(0)defget_rendered_image(self):
- ifself.is_robotics_env:
- # necessary for fetch since the rendered image is cropped to an irrelevant part of the simulator
- image=self._get_robotics_image()
- else:
- image=self.env.render(mode='rgb_array')
- # required for setting up a fixed camera for mujoco
- ifself.is_mujoco_envandnotself.is_roboschool_env:
- self._set_mujoco_camera(0)
+ image=self.env.render(mode='rgb_array')returnimagedefget_target_success_rate(self)->float:
diff --git a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
index 8fc2325..507e9a0 100644
--- a/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
+++ b/docs/_modules/rl_coach/memories/episodic/episodic_experience_replay.html
@@ -179,6 +179,7 @@
Source code for rl_coach.memories.episodic.episodic_experience_replay
#
+## Copyright (c) 2017 Intel Corporation## Licensed under the Apache License, Version 2.0 (the "License");
@@ -193,14 +194,19 @@
# See the License for the specific language governing permissions and# limitations under the License.#
+importast
+importmath
-fromtypingimportList,Tuple,Union,Dict,Any
-
+importpandasaspd
+fromtypingimportList,Tuple,Unionimportnumpyasnp
+importrandomfromrl_coach.core_typesimportTransition,Episode
+fromrl_coach.loggerimportscreenfromrl_coach.memories.memoryimportMemory,MemoryGranularity,MemoryParameters
-fromrl_coach.utilsimportReaderWriterLock
+fromrl_coach.utilsimportReaderWriterLock,ProgressBar
+fromrl_coach.core_typesimportCsvDatasetclassEpisodicExperienceReplayParameters(MemoryParameters):
@@ -208,6 +214,7 @@
super().__init__()self.max_size=(MemoryGranularity.Transitions,1000000)self.n_step=-1
+ self.train_to_eval_ratio=1# for OPE we'll want a value < 1@propertydefpath(self):
@@ -220,7 +227,9 @@
calculations of total return and other values that depend on the sequential behavior of the transitions in the episode. """
- def__init__(self,max_size:Tuple[MemoryGranularity,int]=(MemoryGranularity.Transitions,1000000),n_step=-1):
+
+ def__init__(self,max_size:Tuple[MemoryGranularity,int]=(MemoryGranularity.Transitions,1000000),n_step=-1,
+ train_to_eval_ratio:int=1):""" :param max_size: the maximum number of transitions or episodes to hold in the memory :param train_to_eval_ratio: the fraction of the stored transitions to use for training; the remainder is held out for evaluation (e.g. off-policy evaluation) """
@@ -232,8 +241,11 @@
self._num_transitions=0self._num_transitions_in_complete_episodes=0self.reader_writer_lock=ReaderWriterLock()
+ self.last_training_set_episode_id=None# used in batch-rl
+ self.last_training_set_transition_id=None# used in batch-rl
+ self.train_to_eval_ratio=train_to_eval_ratio# used in batch-rl
- deflength(self,lock:bool=False)->int:
+ deflength(self,lock:bool=False)->int:""" Get the number of episodes in the ER (even if they are not complete) """
@@ -255,6 +267,9 @@
defnum_transitions_in_complete_episodes(self):returnself._num_transitions_in_complete_episodes
+ defget_last_training_set_episode_id(self):
+ returnself.last_training_set_episode_id
+
defsample(self,size:int,is_consecutive_transitions=False)->List[Transition]:""" Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
@@ -272,7 +287,7 @@
batch=self._buffer[episode_idx].transitionselse:transition_idx=np.random.randint(size,self._buffer[episode_idx].length())
- batch=self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]
+ batch=self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]else:transitions_idx=np.random.randint(self.num_transitions_in_complete_episodes(),size=size)batch=[self.transitions[i]foriintransitions_idx]
@@ -285,6 +300,78 @@
returnbatch
+ defget_episode_for_transition(self,transition:Transition)->(int,Episode):
+ """
+ Get the episode from which that transition came from.
+ :param transition: The transition to lookup the episode for
+ :return: (episode number, the episode) or (-1, None) if no matching episode was found.
+ """
+
+ fori,episodeinenumerate(self._buffer):
+ iftransitioninepisode.transitions:
+ returni,episode
+ return-1,None
+
+ defshuffle_episodes(self):
+ """
+ Shuffle all the episodes in the replay buffer
+ :return:
+ """
+ random.shuffle(self._buffer)
+ self.transitions=[tforeinself._bufferfortine.transitions]
+
+ defget_shuffled_data_generator(self,size:int)->List[Transition]:
+ """
+ Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+ If the requested size is larger than the number of samples available in the replay buffer, the returned batch
+ will be empty. The last returned batch may be smaller than the requested size, to accommodate all the
+ transitions in the replay buffer.
+
+ :param size: the size of the batch to return
+ :return: a batch (list) of selected transitions from the replay buffer
+ """
+ self.reader_writer_lock.lock_writing()
+ ifself.last_training_set_transition_idisNone:
+ ifself.train_to_eval_ratio<=0orself.train_to_eval_ratio>1:
+ raiseValueError('train_to_eval_ratio should be in the (0, 1] range.')
+
+ transition=self.transitions[round(self.train_to_eval_ratio*self.num_transitions_in_complete_episodes())]
+ episode_num,episode=self.get_episode_for_transition(transition)
+ self.last_training_set_episode_id=episode_num
+ self.last_training_set_transition_id= \
+ len([tforeinself.get_all_complete_episodes_from_to(0,self.last_training_set_episode_id+1)fortine])
+
+ shuffled_transition_indices=list(range(self.last_training_set_transition_id))
+ random.shuffle(shuffled_transition_indices)
+
+ # The last batch drawn will usually be < batch_size (=the size variable)
+ foriinrange(math.ceil(len(shuffled_transition_indices)/size)):
+ sample_data=[self.transitions[j]forjinshuffled_transition_indices[i*size:(i+1)*size]]
+ self.reader_writer_lock.release_writing()
+
+ yieldsample_data
+
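A sketch of consuming the generator directly for one epoch, assuming memory already holds complete episodes; train_on_batch is a placeholder for whatever per-batch update is being run:

    for transitions in memory.get_shuffled_data_generator(32):
        # each yielded item is a list of up to 32 Transition objects; the last one may be shorter
        train_on_batch(transitions)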
+ defget_all_complete_episodes_transitions(self)->List[Transition]:
+ """
+ Get all the transitions from all the complete episodes in the buffer
+ :return: a list of transitions
+ """
+ returnself.transitions[:self.num_transitions_in_complete_episodes()]
+
+ defget_all_complete_episodes(self)->List[Episode]:
+ """
+ Get all the complete episodes in the buffer
+ :return: a list of episodes
+ """
+ returnself.get_all_complete_episodes_from_to(0,self.num_complete_episodes())
+
+ defget_all_complete_episodes_from_to(self,start_episode_id,end_episode_id)->List[Episode]:
+ """
+ Get all the complete episodes in the buffer within the given episode index range
+ :return: a list of episodes
+ """
+ returnself._buffer[start_episode_id:end_episode_id]
+
def_enforce_max_length(self)->None:""" Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -368,7 +455,7 @@
self.reader_writer_lock.release_writing_and_reading()
- defstore_episode(self,episode:Episode,lock:bool=True)->None:
+ defstore_episode(self,episode:Episode,lock:bool=True)->None:""" Store a new episode in the memory. :param episode: the new episode to store
@@ -391,7 +478,7 @@
iflock:self.reader_writer_lock.release_writing_and_reading()
- defget_episode(self,episode_index:int,lock:bool=True)->Union[None,Episode]:
+ defget_episode(self,episode_index:int,lock:bool=True)->Union[None,Episode]:""" Returns the episode in the given index. If the episode does not exist, returns None instead. :param episode_index: the index of the episode to return
@@ -436,7 +523,7 @@
self.reader_writer_lock.release_writing_and_reading()# for API compatibility
- defget(self,episode_index:int,lock:bool=True)->Union[None,Episode]:
+ defget(self,episode_index:int,lock:bool=True)->Union[None,Episode]:""" Returns the episode in the given index. If the episode does not exist, returns None instead. :param episode_index: the index of the episode to return
@@ -494,7 +581,51 @@
mean=np.mean([transition.rewardfortransitioninself.transitions])self.reader_writer_lock.release_writing()
- returnmean
+ returnmean
+
+ defload_csv(self,csv_dataset:CsvDataset)->None:
+ """
+ Restore the replay buffer contents from a csv file.
+ The csv file is assumed to include a list of transitions.
+ :param csv_dataset: A construct which holds the dataset parameters
+ """
+ df=pd.read_csv(csv_dataset.filepath)
+ iflen(df)>self.max_size[1]:
+ screen.warning("Warning! The number of transitions to load into the replay buffer ({}) is "
+ "bigger than the max size of the replay buffer ({}). The excessive transitions will "
+ "not be stored.".format(len(df),self.max_size[1]))
+
+ episode_ids=df['episode_id'].unique()
+ progress_bar=ProgressBar(len(episode_ids))
+ state_columns=[colforcolindf.columnsifcol.startswith('state_feature')]
+
+ fore_idinepisode_ids:
+ progress_bar.update(e_id)
+ df_episode_transitions=df[df['episode_id']==e_id]
+ episode=Episode()
+ for(_,current_transition),(_,next_transition)inzip(df_episode_transitions[:-1].iterrows(),
+ df_episode_transitions[1:].iterrows()):
+ state=np.array([current_transition[col]forcolinstate_columns])
+ next_state=np.array([next_transition[col]forcolinstate_columns])
+
+ episode.insert(
+ Transition(state={'observation':state},
+ action=current_transition['action'],reward=current_transition['reward'],
+ next_state={'observation':next_state},game_over=False,
+ info={'all_action_probabilities':
+ ast.literal_eval(current_transition['all_action_probabilities'])}))
+
+ # Set the last transition to end the episode
+ ifcsv_dataset.is_episodic:
+ episode.get_last_transition().game_over=True
+
+ self.store_episode(episode)
+
+ # close the progress bar
+ progress_bar.update(len(episode_ids))
+ progress_bar.close()
+
+ self.shuffle_episodes()
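Judging by the columns accessed above, the CSV is expected to hold one row per transition with at least episode_id, action, reward, all_action_probabilities (a string that ast.literal_eval can parse) and one or more state_feature_* columns. A small illustrative file, with made-up values, could be produced like this:

    import pandas as pd

    # two rows of a single episode; consecutive rows are paired into (state, next_state)
    rows = [
        {'episode_id': 0, 'state_feature_0': 0.1, 'state_feature_1': -0.3,
         'action': 1, 'reward': 0.0, 'all_action_probabilities': '[0.25, 0.75]'},
        {'episode_id': 0, 'state_feature_0': 0.2, 'state_feature_1': -0.1,
         'action': 0, 'reward': 1.0, 'all_action_probabilities': '[0.6, 0.4]'},
    ]
    pd.DataFrame(rows).to_csv('/tmp/transitions.csv', index=False)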
diff --git a/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html b/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
index 735d943..6b7c6fc 100644
--- a/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
+++ b/docs/_modules/rl_coach/memories/non_episodic/experience_replay.html
@@ -194,10 +194,10 @@
# limitations under the License.#
-fromtypingimportList,Tuple,Union,Dict,Any
+fromtypingimportList,Tuple,Unionimportpickle
-importsys
-importtime
+importrandom
+importmathimportnumpyasnp
@@ -252,7 +252,6 @@
Sample a batch of transitions from the replay buffer. If the requested size is larger than the number of samples available in the replay buffer, the returned batch will be empty. :param size: the size of the batch to sample
- :param beta: the beta parameter used for importance sampling :return: a batch (list) of selected transitions from the replay buffer """self.reader_writer_lock.lock_writing()
@@ -272,6 +271,28 @@
self.reader_writer_lock.release_writing()returnbatch
+ defget_shuffled_data_generator(self,size:int)->List[Transition]:
+ """
+ Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.
+ If the requested size is larger than the number of samples available in the replay buffer, the returned batch
+ will be empty. The last returned batch may be smaller than the requested size, to accommodate all the
+ transitions in the replay buffer.
+
+ :param size: the size of the batch to return
+ :return: a batch (list) of selected transitions from the replay buffer
+ """
+ self.reader_writer_lock.lock_writing()
+ shuffled_transition_indices=list(range(len(self.transitions)))
+ random.shuffle(shuffled_transition_indices)
+
+ # we deliberately drop some of the ending data which is left after dividing to batches of size `size`
+ # for i in range(math.ceil(len(shuffled_transition_indices) / size)):
+ foriinrange(int(len(shuffled_transition_indices)/size)):
+ sample_data=[self.transitions[j]forjinshuffled_transition_indices[i*size:(i+1)*size]]
+ self.reader_writer_lock.release_writing()
+
+ yieldsample_data
+
def_enforce_max_length(self)->None:""" Make sure that the size of the replay buffer does not pass the maximum size allowed.
@@ -395,7 +416,7 @@
withopen(file_path,'wb')asfile:pickle.dump(self.transitions,file)
- defload(self,file_path:str)->None:
+ defload_pickled(self,file_path:str)->None:""" Restore the replay buffer contents from a pickle file. The pickle file is assumed to include a list of transitions.
@@ -418,6 +439,7 @@
progress_bar.update(transition_idx)progress_bar.close()
+
diff --git a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
index 47babe0..166d287 100644
--- a/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
+++ b/docs/_modules/rl_coach/orchestrators/kubernetes_orchestrator.html
@@ -298,7 +298,7 @@
self.s3_access_key=os.environ.get('ACCESS_KEY_ID')self.s3_secret_key=os.environ.get('SECRET_ACCESS_KEY')
- defsetup(self)->bool:
+ defsetup(self,crd=None)->bool:""" Deploys the memory backend and data stores if required. """
@@ -308,6 +308,9 @@
returnFalseifself.params.data_store_params.store_type=="nfs":self.nfs_pvc=self.data_store.get_info()
+
+ # Upload checkpoints in checkpoint_restore_dir (if provided) to the data store
+ self.data_store.setup_checkpoint_dir(crd)returnTruedefdeploy_trainer(self)->bool:
@@ -321,7 +324,6 @@
trainer_params.command+=['--memory_backend_params',json.dumps(self.params.memory_backend_parameters.__dict__)]trainer_params.command+=['--data_store_params',json.dumps(self.params.data_store_params.__dict__)]
-
name="{}-{}".format(trainer_params.run_type,uuid.uuid4())ifself.params.data_store_params.store_type=="nfs":
@@ -346,7 +348,7 @@
name="nfs-pvc",persistent_volume_claim=self.nfs_pvc)],
- restart_policy='OnFailure'
+ restart_policy='Never'),)else:
@@ -365,7 +367,7 @@
metadata=k8sclient.V1ObjectMeta(labels={'app':name}),spec=k8sclient.V1PodSpec(containers=[container],
- restart_policy='OnFailure'
+ restart_policy='Never'),)
@@ -427,7 +429,7 @@
name="nfs-pvc",persistent_volume_claim=self.nfs_pvc)],
- restart_policy='OnFailure'
+ restart_policy='Never'),)else:
@@ -446,7 +448,7 @@
metadata=k8sclient.V1ObjectMeta(labels={'app':name}),spec=k8sclient.V1PodSpec(containers=[container],
- restart_policy='OnFailure'
+ restart_policy='Never'))
@@ -496,7 +498,7 @@
returnforpodinpods.items:
- Process(target=self._tail_log_file,args=(pod.metadata.name,api_client,self.params.namespace,path)).start()
+ Process(target=self._tail_log_file,args=(pod.metadata.name,api_client,self.params.namespace,path),daemon=True).start()def_tail_log_file(self,pod_name,api_client,namespace,path):ifnotos.path.exists(path):
@@ -528,7 +530,7 @@
ifnotpod:return
- self.tail_log(pod.metadata.name,api_client)
+ returnself.tail_log(pod.metadata.name,api_client)deftail_log(self,pod_name,corev1_api):whileTrue:
@@ -562,9 +564,9 @@
container_status.state.waiting.reason=='CrashLoopBackOff'or \
container_status.state.waiting.reason=='ImagePullBackOff'or \
container_status.state.waiting.reason=='ErrImagePull':
- return
+ return1ifcontainer_status.state.terminatedisnotNone:
- return
+ returncontainer_status.state.terminated.exit_codedefundeploy(self):"""
diff --git a/docs/_modules/rl_coach/spaces.html b/docs/_modules/rl_coach/spaces.html
index b38d15b..75d67e3 100644
--- a/docs/_modules/rl_coach/spaces.html
+++ b/docs/_modules/rl_coach/spaces.html
@@ -828,7 +828,7 @@
"""def__init__(self,state:StateSpace,
- goal:ObservationSpace,
+ goal:Union[ObservationSpace,None],action:ActionSpace,reward:RewardSpace):self.state=state
diff --git a/docs/_sources/components/agents/index.rst.txt b/docs/_sources/components/agents/index.rst.txt
index 62aaf0e..476bc7a 100644
--- a/docs/_sources/components/agents/index.rst.txt
+++ b/docs/_sources/components/agents/index.rst.txt
@@ -21,6 +21,7 @@ A detailed description of those algorithms can be found by navigating to each of
imitation/cil
policy_optimization/cppo
policy_optimization/ddpg
+ policy_optimization/sac
other/dfp
value_optimization/double_dqn
value_optimization/dqn
diff --git a/docs/_sources/components/agents/policy_optimization/acer.rst.txt b/docs/_sources/components/agents/policy_optimization/acer.rst.txt
index 7808443..d16a0e7 100644
--- a/docs/_sources/components/agents/policy_optimization/acer.rst.txt
+++ b/docs/_sources/components/agents/policy_optimization/acer.rst.txt
@@ -38,6 +38,7 @@ Each update perform the following procedure:
.. math:: \text{where} \quad \bar{\rho}_{t} = \min{\left\{c,\rho_t\right\}},\quad \rho_t=\frac{\pi (a_t \mid s_t)}{\mu (a_t \mid s_t)}
3. **Accumulate gradients:**
+
:math:`\bullet` **Policy gradients (with bias correction):**
.. math:: \hat{g}_t^{policy} & = & \bar{\rho}_{t} \nabla \log \pi (a_t \mid s_t) [Q^{ret}(s_t,a_t) - V(s_t)] \\
diff --git a/docs/_sources/components/agents/policy_optimization/sac.rst.txt b/docs/_sources/components/agents/policy_optimization/sac.rst.txt
new file mode 100644
index 0000000..418c87e
--- /dev/null
+++ b/docs/_sources/components/agents/policy_optimization/sac.rst.txt
@@ -0,0 +1,49 @@
+Soft Actor-Critic
+=================
+
+**Actions space:** Continuous
+
+**References:** `Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor `_
+
+Network Structure
+-----------------
+
+.. image:: /_static/img/design_imgs/sac.png
+ :align: center
+
+Algorithm Description
+---------------------
+
+Choosing an action - Continuous actions
+++++++++++++++++++++++++++++++++++++++++
+
+The policy network predicts a mean and a log std for each action. During training, the action is sampled
+from a Gaussian distribution with these mean and std values. During evaluation, the agent can either act deterministically
+by picking the mean value, or sample from the Gaussian distribution as in training.
+
+Training the network
+++++++++++++++++++++
+Start by sampling a batch :math:`B` of transitions from the experience replay.
+
+* To train the **Q network**, use the following targets:
+
+ .. math:: y_t^Q=r(s_t,a_t)+\gamma \cdot V(s_{t+1})
+
+ The state value used in the above target is acquired by running the target state value network.
+
+* To train the **State Value network**, use the following targets:
+
+ .. math:: y_t^V = \min_{i=1,2}Q_i(s_t,\tilde{a}) - \log\pi (\tilde{a} \vert s_t),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)
+
+ The state value network is trained using a sample-based approximation of the connection between the state value and the
+ state-action values. The actions used for constructing the target are **not** sampled from the replay buffer, but rather sampled
+ from the current policy.
+
+* To train the **actor network**, use the following equation:
+
+ .. math:: \nabla_{\theta} J \approx \nabla_{\theta} \frac{1}{\vert B \vert} \sum_{s_t\in B} \left( Q \left(s_t, \tilde{a}_\theta(s_t)\right) - \log\pi_{\theta}(\tilde{a}_{\theta}(s_t)\vert s_t) \right),\,\,\,\, \tilde{a} \sim \pi(\cdot \vert s_t)
+
+After every training step, do a soft update of the V target network's weights from the online state value network.
+
+
+.. autoclass:: rl_coach.agents.soft_actor_critic_agent.SoftActorCriticAlgorithmParameters
\ No newline at end of file
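A minimal numpy sketch of the state value target described above, assuming q1 and q2 hold the two Q estimates for actions freshly sampled from the current policy and log_pi their log-probabilities (the names are placeholders, not Coach API):

    import numpy as np

    def state_value_target(q1, q2, log_pi):
        # y_t^V = min(Q1, Q2) - log pi(a~ | s), computed element-wise over the batch
        return np.minimum(q1, q2) - log_pi

    print(state_value_target(np.array([1.2, 0.7]),
                             np.array([1.0, 0.9]),
                             np.array([-0.5, -1.1])))  # [1.5 1.8]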
diff --git a/docs/_sources/selecting_an_algorithm.rst.txt b/docs/_sources/selecting_an_algorithm.rst.txt
index e56f802..e4027de 100644
--- a/docs/_sources/selecting_an_algorithm.rst.txt
+++ b/docs/_sources/selecting_an_algorithm.rst.txt
@@ -198,6 +198,14 @@ The algorithms are ordered by their release date in descending order.
improve stability it also employs bias correction and trust region optimization techniques.
+
+
+ SAC
+
+ Soft Actor-Critic is an algorithm which optimizes a stochastic policy in an off-policy way.
+ One of the key features of SAC is that it solves a maximum entropy reinforcement learning problem.
+
+
framework_type – deep learning framework type. currently only tensorflow is supported
-
evaluate_only – the task will be used only for evaluating the model
+
evaluate_only – if not None, the task will be used only for evaluating the model for the given number of steps.
+A value of 0 means that the task will be evaluated for an infinite number of steps.
use_cpu – use the cpu for this task
experiment_path – the path to the directory which will store all the experiment outputs
seed – a seed to use for the random numbers generator
checkpoint_save_secs – the number of seconds between each checkpoint saving
-
checkpoint_restore_dir – the directory to restore the checkpoints from
+
checkpoint_restore_dir – [DEPRECATED - will be removed in one of the next releases - switch to checkpoint_restore_path]
+the directory to restore the checkpoints from
+
checkpoint_restore_path – the path to restore the checkpoints from
checkpoint_save_dir – the directory to store the checkpoints in
export_onnx_graph – If set to True, this will export an onnx graph each time a checkpoint is saved
apply_stop_condition – If set to True, this will apply the stop condition defined by reaching a target success rate
framework_type – deep learning framework type. currently only tensorflow is supported
-
evaluate_only – the task will be used only for evaluating the model
+
evaluate_only – if not None, the task will be used only for evaluating the model for the given number of steps.
+A value of 0 means that the task will be evaluated for an infinite number of steps.
parameters_server_hosts – comma-separated list of hostname:port pairs to which the parameter servers are
assigned
worker_hosts – comma-separated list of hostname:port pairs to which the workers are assigned
@@ -325,7 +329,7 @@ assigned
dnd – an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.
seed – a seed to use for the random numbers generator
checkpoint_save_secs – the number of seconds between each checkpoint saving
-
checkpoint_restore_dir – the directory to restore the checkpoints from
+
checkpoint_restore_path – the path to restore the checkpoints from
checkpoint_save_dir – the directory to store the checkpoints in
export_onnx_graph – If set to True, this will export an onnx graph each time a checkpoint is saved
apply_stop_condition – If set to True, this will apply the stop condition defined by reaching a target success rate
This emulates the act using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given the agents current knowledge, decide on the next action to apply to the environment
-:return: an action and a dictionary containing any additional info from the action decision process
This emulates the observe using the transition obtained from the rollout worker on the training worker
-in case of distributed training.
-Given a response from the environment, distill the observation from it and store it for later use.
-The response should be a dictionary containing the performed action, the new observation and measurements,
-the reward, a game over flag and any additional information necessary.
-:return:
Convert curr_state into the input tensors tensorflow is expecting, i.e. if we have several input states, stack all
observations together, measurements together, etc.
@@ -632,6 +616,21 @@ by val, and by the current phase set in self.phase.
Run off-policy evaluation estimators to evaluate the trained policy performance against a dataset.
+Should only be implemented for off-policy RL algorithms.