Mirror of https://github.com/gryf/coach.git (synced 2025-12-17 19:20:19 +01:00)
Itaicaspi/episode reset refactoring (#105)
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* revert tensorflow-gpu to 1.9.0 + bug fix in should_train()
* tests readme file and refactoring of policy optimization agent train function
* Update README.md
* Update README.md
* additional policy optimization train function simplifications
* Updated the traces after the reordering of the environment reset
* docker and jenkins files
* updated the traces to the ones from within the docker container
* updated traces and added control suite to the docker
* updated jenkins file with the intel proxy + updated doom basic a3c test params
* updated line breaks in jenkins file
* added a missing line break in jenkins file
* refining trace tests ignored presets + adding a configurable beta entropy value
* switch the order of trace and golden tests in jenkins + fix golden tests processes not killed issue
* updated benchmarks for dueling ddqn breakout and pong
* allowing dynamic updates to the loss weights + bug fix in episode.update_returns
* remove docker and jenkins file
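The first two bullets are the core behavioural change: an episode is collected in a local buffer while it is running and is only stored in memory once it terminates. A minimal, self-contained sketch of that pattern; `EpisodeBuffer`, `ReplayMemory` and `observe` below are hypothetical names used for illustration, not Coach's actual classes:

```python
# Hedged sketch of "store episodes only when they are terminated".
# EpisodeBuffer, ReplayMemory and observe are illustrative, not Coach classes.

class EpisodeBuffer:
    def __init__(self):
        self.transitions = []
        self.is_complete = False

    def add(self, transition, terminal):
        self.transitions.append(transition)
        self.is_complete = terminal


class ReplayMemory:
    def __init__(self):
        self.episodes = []

    def store_episode(self, episode):
        # only ever called with a finished episode
        self.episodes.append(episode)


def observe(buffer, memory, transition, terminal):
    buffer.add(transition, terminal)
    if buffer.is_complete:
        # the whole episode is flushed to memory only on termination
        memory.store_episode(buffer)
        return EpisodeBuffer()  # fresh buffer for the next episode
    return buffer
```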
```diff
@@ -82,19 +82,13 @@ class PolicyOptimizationAgent(Agent):
         self.mean_discounted_return = np.mean(episode_discounted_returns)
         self.std_discounted_return = np.std(episode_discounted_returns)
 
-    def get_current_episode(self):
-        # we get the episode most of the time from the current episode buffer and only in the last transition from the
-        # "memory" (where is was stored in the end of the episode)
-        return self.memory.get_episode(0) or self.current_episode_buffer
-
     def train(self):
-        episode = self.get_current_episode()
+        episode = self.current_episode_buffer
 
         # check if we should calculate gradients or skip
-        episode_ended = episode.is_complete
         num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
         is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
-        if not (is_t_max_steps_passed or episode_ended):
+        if not (is_t_max_steps_passed or episode.is_complete):
             return 0
 
         total_loss = 0
```
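In the hunk above, training now proceeds only when t_max steps (`num_steps_between_gradient_updates`) have accumulated since the last gradient update, or the episode has terminated. A minimal sketch of that check, assuming a made-up helper name `should_train_now` that is not part of Coach's API:

```python
# Hedged sketch of the gating logic at the top of train().
# should_train_now is an illustrative helper, not part of Coach.

def should_train_now(episode_length, last_update_step_idx,
                     num_steps_between_gradient_updates, episode_is_complete):
    num_steps_passed = episode_length - last_update_step_idx
    is_t_max_steps_passed = num_steps_passed >= num_steps_between_gradient_updates
    return is_t_max_steps_passed or episode_is_complete


# Example: with t_max = 20, training triggers every 20 new steps,
# or earlier if the episode terminates.
assert should_train_now(40, 20, 20, False) is True
assert should_train_now(30, 20, 20, False) is False
assert should_train_now(30, 20, 20, True) is True
```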
```diff
@@ -105,14 +99,15 @@ class PolicyOptimizationAgent(Agent):
 
             # get t_max transitions or less if the we got to a terminal state
             # will be used for both actor-critic and vanilla PG.
-            # # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
-            transitions = []
-            start_idx = self.last_gradient_update_step_idx
-            end_idx = episode.length()
-            for idx in range(start_idx, end_idx):
-                transitions.append(episode.get_transition(idx))
-            self.last_gradient_update_step_idx = end_idx
+            # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
+            transitions = episode[self.last_gradient_update_step_idx:]
+            batch = Batch(transitions)
+
+            # move the pointer for the last update step
+            if episode.is_complete:
+                self.last_gradient_update_step_idx = 0
+            else:
+                self.last_gradient_update_step_idx = episode.length()
 
             # update the statistics for the variance reduction techniques
             if self.policy_gradient_rescaler in \
```
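The hunk above replaces the index loop with a slice and moves the `last_gradient_update_step_idx` bookkeeping next to it, so each call consumes only the transitions added since the previous update and the pointer resets when the episode ends. A rough illustration on a plain Python list, with `take_new_transitions` as a hypothetical stand-in for the in-method logic:

```python
# Illustration of the slice-plus-pointer bookkeeping from the hunk above,
# using a plain list in place of Coach's Episode object.

def take_new_transitions(episode_transitions, last_update_step_idx, episode_is_complete):
    # grab everything added since the previous gradient update
    transitions = episode_transitions[last_update_step_idx:]

    # move the pointer: reset it when the episode ended,
    # otherwise remember how far we have already consumed
    if episode_is_complete:
        next_pointer = 0
    else:
        next_pointer = len(episode_transitions)
    return transitions, next_pointer


steps = list(range(7))                                   # pretend transitions 0..6
batch1, ptr = take_new_transitions(steps, 0, False)      # -> [0..6], ptr = 7
steps += [7, 8]                                          # two more steps arrive
batch2, ptr = take_new_transitions(steps, ptr, True)     # -> [7, 8], ptr = 0
```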
```diff
@@ -120,21 +115,17 @@ class PolicyOptimizationAgent(Agent):
                      PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
                 self.update_episode_statistics(episode)
 
-            # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
-            batch = Batch(transitions)
+            # accumulate the gradients
             total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
+
+            # apply the gradients once in every apply_gradients_every_x_episodes episodes
             if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
                 for network in self.networks.values():
                     network.apply_gradients_and_sync_networks()
                 self.training_iteration += 1
 
-        # move the pointer to the next episode start and discard the episode.
-        if episode_ended:
-            # we need to remove the episode, because the next training iteration will be called before storing any
-            # additional transitions in the memory (we don't store a transition for the first call to observe), so the
-            # length of the memory won't be enforced and the old episode won't be removed
-            self.call_memory('remove_episode', 0)
-            self.last_gradient_update_step_idx = 0
-
         # run additional commands after the training is done
         self.post_training_commands()
 
         return total_loss
```
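The last hunk separates "accumulate the gradients" (`learn_from_batch`) from "apply them once every `apply_gradients_every_x_episodes` episodes" (`apply_gradients_and_sync_networks`). A self-contained NumPy sketch of that accumulate-then-apply pattern; `GradientAccumulator` below is an illustrative stand-in, not a Coach class:

```python
import numpy as np

# Hedged sketch of "accumulate gradients, apply them once every x episodes".
# GradientAccumulator is an illustrative stand-in for Coach's network wrapper.

class GradientAccumulator:
    def __init__(self, params, apply_every_x_episodes, lr=0.01):
        self.params = params                       # flat parameter vector
        self.accumulated = np.zeros_like(params)   # gradient buffer
        self.apply_every = apply_every_x_episodes
        self.lr = lr

    def accumulate(self, grads):
        # called once per training iteration (analogous to learn_from_batch)
        self.accumulated += grads

    def maybe_apply(self, episode_idx):
        # called after an episode (analogous to apply_gradients_and_sync_networks)
        if episode_idx % self.apply_every == 0:
            self.params -= self.lr * self.accumulated
            self.accumulated[:] = 0.0


acc = GradientAccumulator(np.ones(3), apply_every_x_episodes=5)
for episode in range(1, 11):
    acc.accumulate(np.full(3, 0.1))
    acc.maybe_apply(episode)   # parameters change only on episodes 5 and 10
```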