Mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)
rename AgentInterface.emulate_observe_on_trainer to observe_transition and call it from AgentInterface.observe
@@ -900,31 +900,35 @@ class Agent(AgentInterface):
         # make agent specific changes to the transition if needed
         transition = self.update_transition_before_adding_to_replay_buffer(transition)

-        # sum up the total shaped reward
-        self.total_shaped_reward_in_current_episode += transition.reward
-        self.total_reward_in_current_episode += env_response.reward
-        self.shaped_reward.add_sample(transition.reward)
-        self.reward.add_sample(env_response.reward)
-
         # add action info to transition
         if type(self.parent).__name__ == 'CompositeAgent':
             transition.add_info(self.parent.last_action_info.__dict__)
         else:
             transition.add_info(self.last_action_info.__dict__)

-        # create and store the transition
-        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
-            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
-            # for regular memories we insert the transitions directly to the memory
-            self.current_episode_buffer.insert(transition)
-            if not isinstance(self.memory, EpisodicExperienceReplay) \
-                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
-                self.call_memory('store', transition)
-
-        if self.ap.visualization.dump_in_episode_signals:
-            self.update_step_in_episode_log()
-
-        return transition.game_over
+        self.total_reward_in_current_episode += env_response.reward
+        self.reward.add_sample(env_response.reward)
+
+        return self.observe_transition(transition)
+
+    def observe_transition(self, transition):
+        # sum up the total shaped reward
+        self.total_shaped_reward_in_current_episode += transition.reward
+        self.shaped_reward.add_sample(transition.reward)
+
+        # create and store the transition
+        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
+            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
+            # for regular memories we insert the transitions directly to the memory
+            self.current_episode_buffer.insert(transition)
+            if not isinstance(self.memory, EpisodicExperienceReplay) \
+                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
+                self.call_memory('store', transition)
+
+        if self.ap.visualization.dump_in_episode_signals:
+            self.update_step_in_episode_log()
+
+        return transition.game_over

     def post_training_commands(self) -> None:
         """
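In effect this hunk is an extract-method refactor: Agent.observe() keeps the bookkeeping that needs the raw EnvResponse (env_response.reward), while everything that only needs a Transition — shaped-reward accounting, memory storage, and the episode-termination flag — moves into the new observe_transition(), which observe() now tail-calls. The toy below is my own minimal sketch of that split (not Coach code; the class and field names are simplified stand-ins):

# Toy illustration (not Coach code) of the split introduced in this hunk:
# observe() keeps the env-response bookkeeping, observe_transition() handles
# everything that only needs a transition and returns the done flag.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Transition:              # simplified stand-in for Coach's Transition
    reward: float              # shaped reward
    game_over: bool


@dataclass
class ToyAgent:
    total_reward_in_current_episode: float = 0.0
    total_shaped_reward_in_current_episode: float = 0.0
    replay_buffer: List[Transition] = field(default_factory=list)

    def observe(self, env_reward: float, transition: Transition) -> bool:
        # raw-reward signals still come from the environment response
        self.total_reward_in_current_episode += env_reward
        # everything transition-only is delegated, mirroring Agent.observe()
        return self.observe_transition(transition)

    def observe_transition(self, transition: Transition) -> bool:
        # shaped-reward accounting and storage, usable without an EnvResponse
        self.total_shaped_reward_in_current_episode += transition.reward
        self.replay_buffer.append(transition)
        return transition.game_over


agent = ToyAgent()
done = agent.observe(env_reward=1.0, transition=Transition(reward=0.5, game_over=False))
assert done is False
assert agent.total_shaped_reward_in_current_episode == 0.5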
@@ -1009,38 +1013,6 @@ class Agent(AgentInterface):
         for network in self.networks.values():
             network.sync()

-    # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
-    # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-    def emulate_observe_on_trainer(self, transition: Transition) -> bool:
-        """
-        This emulates the observe using the transition obtained from the rollout worker on the training worker
-        in case of distributed training.
-        Given a response from the environment, distill the observation from it and store it for later use.
-        The response should be a dictionary containing the performed action, the new observation and measurements,
-        the reward, a game over flag and any additional information necessary.
-        :return:
-        """
-
-        # sum up the total shaped reward
-        self.total_shaped_reward_in_current_episode += transition.reward
-        self.total_reward_in_current_episode += transition.reward
-        self.shaped_reward.add_sample(transition.reward)
-        self.reward.add_sample(transition.reward)
-
-        # create and store the transition
-        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
-            # for episodic memories we keep the transitions in a local buffer until the episode is ended.
-            # for regular memories we insert the transitions directly to the memory
-            self.current_episode_buffer.insert(transition)
-            if not isinstance(self.memory, EpisodicExperienceReplay) \
-                    and not self.ap.algorithm.store_transitions_only_when_episodes_are_terminated:
-                self.call_memory('store', transition)
-
-        if self.ap.visualization.dump_in_episode_signals:
-            self.update_step_in_episode_log()
-
-        return transition.game_over
-
     def get_success_rate(self) -> float:
         return self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed

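Comparing the removed method with the new observe_transition() above shows one behavioural nuance: emulate_observe_on_trainer() fed transition.reward into both the raw-reward signals (total_reward_in_current_episode, reward) and the shaped-reward signals, whereas observe_transition() updates only the shaped-reward ones and leaves the raw env_response.reward bookkeeping to observe(), which a trainer-only flow never calls. A minimal sketch of that difference (my own illustration, not Coach code):

# Toy illustration (not Coach code): which reward signals a trainer-only flow
# updates after this commit, given only transitions from a rollout worker.
from dataclasses import dataclass


@dataclass
class Signals:
    total_reward: float = 0.0          # raw reward, fed from env_response in observe()
    total_shaped_reward: float = 0.0   # shaped reward, fed from the transition


def trainer_side_observe(signals: Signals, shaped_reward: float) -> None:
    # Mirrors observe_transition(): only the shaped-reward signal is touched.
    signals.total_shaped_reward += shaped_reward
    # The removed emulate_observe_on_trainer() additionally did the equivalent of
    #   signals.total_reward += shaped_reward
    # i.e. it reused transition.reward for the raw-reward signal as well.


signals = Signals()
trainer_side_observe(signals, shaped_reward=0.5)
assert signals.total_shaped_reward == 0.5 and signals.total_reward == 0.0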
@@ -125,23 +125,6 @@ class AgentInterface(object):
         """
         raise NotImplementedError("")

-    # TODO-remove - this is a temporary flow, used by the trainer worker, duplicated from observe() - need to create
-    # an external trainer flow reusing the existing flow and methods [e.g. observe(), step(), act()]
-    def emulate_observe_on_trainer(self, transition: Transition) -> bool:
-        """
-        This emulates the act using the transition obtained from the rollout worker on the training worker
-        in case of distributed training.
-        Gets a response from the environment.
-        Processes this information for later use. For example, create a transition and store it in memory.
-        The action info (a class containing any info the agent wants to store regarding its action decision process) is
-        stored by the agent itself when deciding on the action.
-        :param env_response: a EnvResponse containing the response from the environment
-        :return: a done signal which is based on the agent knowledge. This can be different from the done signal from
-                 the environment. For example, an agent can decide to finish the episode each time it gets some
-                 intrinsic reward
-        """
-        raise NotImplementedError("")
-
     def collect_savers(self, parent_path_suffix: str) -> SaverCollection:
         """
         Collect all of agent savers
@@ -312,7 +312,7 @@ class LevelManager(EnvironmentInterface):

         # for i in range(self.steps_limit.num_steps):
         # let the agent observe the result and decide if it wants to terminate the episode
-        done = acting_agent.emulate_observe_on_trainer(transition)
+        done = acting_agent.observe_transition(transition)
         acting_agent.act(transition.action)

         if done:
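This call site sits in the LevelManager's trainer-side stepping flow, which replays transitions restored from a rollout worker. A simplified sketch of that loop, using only the calls visible in this hunk (my own toy classes, not Coach code):

# Simplified sketch (not Coach code) of the trainer-side loop this call site
# belongs to: each transition restored from a rollout worker is observed via
# observe_transition(), its recorded action is replayed via act(), and the
# loop stops once the agent reports that the episode is over.
from dataclasses import dataclass
from typing import Iterable


@dataclass
class Transition:
    action: int
    reward: float
    game_over: bool


class ToyTrainerAgent:
    def observe_transition(self, transition: Transition) -> bool:
        # store/accumulate the transition (elided) and report the done flag
        return transition.game_over

    def act(self, action: int) -> None:
        # replay the recorded action so any internal bookkeeping stays consistent
        pass


def replay_rollout(acting_agent: ToyTrainerAgent, transitions: Iterable[Transition]) -> None:
    for transition in transitions:
        # let the agent observe the result and decide if it wants to terminate the episode
        done = acting_agent.observe_transition(transition)
        acting_agent.act(transition.action)
        if done:
            break


replay_rollout(ToyTrainerAgent(), [Transition(0, 1.0, False), Transition(1, 0.0, True)])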