Mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)
Itaicaspi/episode reset refactoring (#105)
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* revert tensorflow-gpu to 1.9.0 + bug fix in should_train()
* tests readme file and refactoring of policy optimization agent train function
* Update README.md
* Update README.md
* additional policy optimization train function simplifications
* Updated the traces after the reordering of the environment reset
* docker and jenkins files
* updated the traces to the ones from within the docker container
* updated traces and added control suite to the docker
* updated jenkins file with the intel proxy + updated doom basic a3c test params
* updated line breaks in jenkins file
* added a missing line break in jenkins file
* refining trace tests ignored presets + adding a configurable beta entropy value
* switch the order of trace and golden tests in jenkins + fix golden tests processes not killed issue
* updated benchmarks for dueling ddqn breakout and pong
* allowing dynamic updates to the loss weights + bug fix in episode.update_returns
* remove docker and jenkins file
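The bullet about storing episodes only when they are terminated comes down to buffering the transitions of the in-progress episode and committing the whole episode to memory only once a terminal transition arrives. The sketch below illustrates that idea under assumed names; Transition and EpisodicBuffer are hypothetical stand-ins for this illustration, not classes from this repository.

# Hypothetical sketch of "store episodes only when they are terminated".
# Transition and EpisodicBuffer are illustrative names, not coach classes.
from dataclasses import dataclass
from typing import Any, List


@dataclass
class Transition:
    state: Any
    action: Any
    reward: float
    game_over: bool


class EpisodicBuffer:
    def __init__(self) -> None:
        self._current_episode: List[Transition] = []
        self.stored_episodes: List[List[Transition]] = []

    def observe(self, transition: Transition) -> None:
        self._current_episode.append(transition)
        if transition.game_over:
            # The episode is committed to storage only once it has actually
            # terminated, so half-finished episodes never reach the memory.
            self.stored_episodes.append(self._current_episode)
            self._current_episode = []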
@@ -70,7 +70,7 @@ class LevelManager(EnvironmentInterface):
         self.should_reset_agent_state_after_time_limit_passes = should_reset_agent_state_after_time_limit_passes
         self.full_name_id = self.name = name
         self._phase = RunPhase.HEATUP
-        self.level_was_reset = True
+        self.reset_required = False

         # set self as the parent for all the composite agents
         for agent in self.agents.values():
@@ -104,7 +104,7 @@ class LevelManager(EnvironmentInterface):
         :return: the environment response as returned in get_last_env_response
         """
         [agent.reset_internal_state() for agent in self.agents.values()]
-        self.level_was_reset = True
+        self.reset_required = False
         if self.real_environment.current_episode_steps_counter == 0:
             self.last_env_response = self.real_environment.last_env_response
         return self.last_env_response
@@ -203,6 +203,9 @@ class LevelManager(EnvironmentInterface):
         for agent_name, agent in self.agents.items():
             agent.set_incoming_directive(action)

+        if self.reset_required:
+            self.reset_internal_state()
+
         # get last response or initial response from the environment
         env_response = copy.copy(self.environment.last_env_response)

@@ -238,7 +241,7 @@ class LevelManager(EnvironmentInterface):
             # this is the agent's only opportunity to observe this transition - he will not get another one
             acting_agent.observe(env_response) # TODO: acting agent? maybe all of the agents in the layer?
             self.handle_episode_ended()
-            self.reset_internal_state()
+            self.reset_required = True

         return env_response_for_upper_level
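Taken together, the hunks move the reset out of the episode-end path: when an episode ends, the manager now only flags self.reset_required = True, and the actual reset_internal_state() call is deferred to the start of the next step(), after the agents have observed the final transition. Below is a minimal standalone sketch of that deferred-reset flow; ToyEnv and ToyLevelManager are made-up stand-ins, not the actual coach classes.

# Minimal sketch of the deferred-reset pattern from the hunks above.
# ToyEnv and ToyLevelManager are hypothetical stand-ins, not coach classes.

class ToyEnv:
    """Toy environment whose episodes end after three steps."""
    def __init__(self):
        self.steps = 0

    def reset(self):
        self.steps = 0
        return {"observation": self.steps, "done": False}

    def step(self, action):
        self.steps += 1
        return {"observation": self.steps, "done": self.steps >= 3}


class ToyLevelManager:
    def __init__(self, env):
        self.env = env
        self.reset_required = False
        self.last_env_response = self.env.reset()

    def reset_internal_state(self):
        # In coach this would also reset every agent's internal state.
        self.last_env_response = self.env.reset()
        self.reset_required = False

    def step(self, action):
        # The reset runs lazily at the start of the next step instead of
        # immediately when the previous episode ended.
        if self.reset_required:
            self.reset_internal_state()

        env_response = self.env.step(action)
        if env_response["done"]:
            # Only flag the reset: the agent still gets to observe this
            # final transition before any state is cleared.
            self.reset_required = True
        self.last_env_response = env_response
        return env_response


if __name__ == "__main__":
    manager = ToyLevelManager(ToyEnv())
    for t in range(7):
        print(t, manager.step(action=None))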