mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

Itaicaspi/episode reset refactoring (#105)

* reordering of the episode reset operation and allowing episodes to be stored only when they are terminated

* reordering of the episode reset operation and allowing episodes to be stored only when they are terminated

* revert tensorflow-gpu to 1.9.0 + bug fix in should_train()

* tests readme file and refactoring of policy optimization agent train function

* Update README.md

* Update README.md

* additional policy optimization train function simplifications

* Updated the traces after the reordering of the environment reset

* docker and jenkins files

* updated the traces to the ones from within the docker container

* updated traces and added control suite to the docker

* updated jenkins file with the intel proxy + updated doom basic a3c test params

* updated line breaks in jenkins file

* added a missing line break in jenkins file

* refining the presets ignored by the trace tests + adding a configurable beta entropy value

* switched the order of trace and golden tests in jenkins + fixed golden test processes not being killed

* updated benchmarks for dueling ddqn breakout and pong

* allowing dynamic updates to the loss weights + bug fix in episode.update_returns

* remove docker and jenkins file
Itai Caspi
2018-09-04 15:07:54 +03:00
committed by GitHub
parent 7086492127
commit 72a1d9d426
92 changed files with 9803 additions and 9740 deletions


@@ -82,19 +82,13 @@ class PolicyOptimizationAgent(Agent):
         self.mean_discounted_return = np.mean(episode_discounted_returns)
         self.std_discounted_return = np.std(episode_discounted_returns)

-    def get_current_episode(self):
-        # we get the episode most of the time from the current episode buffer and only in the last transition from the
-        # "memory" (where is was stored in the end of the episode)
-        return self.memory.get_episode(0) or self.current_episode_buffer
-
     def train(self):
-        episode = self.get_current_episode()
+        episode = self.current_episode_buffer

         # check if we should calculate gradients or skip
-        episode_ended = episode.is_complete
         num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
         is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
-        if not (is_t_max_steps_passed or episode_ended):
+        if not (is_t_max_steps_passed or episode.is_complete):
             return 0

         total_loss = 0
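For reference, here is a minimal sketch of the gating condition introduced above: a gradient update is triggered either when t_max steps have accumulated since the last update or when the episode has terminated. The Episode class below is a simplified stand-in for illustration only, not the Coach implementation; just the attribute and method names mirror the diff.

    class Episode:
        """Simplified stand-in for Coach's episode buffer (illustration only)."""
        def __init__(self):
            self.transitions = []
            self.is_complete = False

        def length(self):
            return len(self.transitions)

        def __getitem__(self, idx):
            # supports episode[start:] slicing as used in the refactored train()
            return self.transitions[idx]


    def should_train(episode, last_gradient_update_step_idx, num_steps_between_gradient_updates):
        """True when t_max steps passed since the last gradient update or the episode ended."""
        num_steps_passed_since_last_update = episode.length() - last_gradient_update_step_idx
        is_t_max_steps_passed = num_steps_passed_since_last_update >= num_steps_between_gradient_updates
        return is_t_max_steps_passed or episode.is_complete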
@@ -105,14 +99,15 @@ class PolicyOptimizationAgent(Agent):
             # get t_max transitions or less if the we got to a terminal state
             # will be used for both actor-critic and vanilla PG.
-            # # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
-            transitions = []
-            start_idx = self.last_gradient_update_step_idx
-            end_idx = episode.length()
+            # In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
+            transitions = episode[self.last_gradient_update_step_idx:]
+            batch = Batch(transitions)

-            for idx in range(start_idx, end_idx):
-                transitions.append(episode.get_transition(idx))
-            self.last_gradient_update_step_idx = end_idx
+            # move the pointer for the last update step
+            if episode.is_complete:
+                self.last_gradient_update_step_idx = 0
+            else:
+                self.last_gradient_update_step_idx = episode.length()

             # update the statistics for the variance reduction techniques
             if self.policy_gradient_rescaler in \
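A minimal sketch of the new slicing and pointer logic in this hunk, reusing the hypothetical Episode stand-in above: the transitions collected since the last update are sliced directly from the episode buffer, and the pointer is reset to 0 when the episode is complete (the buffer is about to be replaced) or advanced to the current episode length otherwise.

    def take_new_transitions(episode, last_gradient_update_step_idx):
        # slice everything collected since the last gradient update
        transitions = episode[last_gradient_update_step_idx:]

        # move the pointer for the next call: reset at episode end, otherwise advance
        if episode.is_complete:
            next_pointer = 0
        else:
            next_pointer = episode.length()
        return transitions, next_pointer

With the stand-in above, a completed episode yields all of its remaining transitions together with a pointer of 0 for the fresh buffer of the next episode.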
@@ -120,21 +115,17 @@ class PolicyOptimizationAgent(Agent):
                      PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
                 self.update_episode_statistics(episode)

-            # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
-            batch = Batch(transitions)
+            # accumulate the gradients
             total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
+
+            # apply the gradients once in every apply_gradients_every_x_episodes episodes
             if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
                 for network in self.networks.values():
                     network.apply_gradients_and_sync_networks()
                 self.training_iteration += 1

-        # move the pointer to the next episode start and discard the episode.
-        if episode_ended:
-            # we need to remove the episode, because the next training iteration will be called before storing any
-            # additional transitions in the memory (we don't store a transition for the first call to observe), so the
-            # length of the memory won't be enforced and the old episode won't be removed
-            self.call_memory('remove_episode', 0)
-            self.last_gradient_update_step_idx = 0
-
         # run additional commands after the training is done
         self.post_training_commands()

         return total_loss
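Finally, a small sketch of the gradient application schedule kept by this hunk: losses and gradients are computed on every qualifying train() call, but they are only applied and synced once every apply_gradients_every_x_episodes episodes. The Network class here is a hypothetical stand-in; only the method name and the modulo schedule follow the diff.

    class Network:
        """Hypothetical stand-in exposing the method name used in the diff."""
        def apply_gradients_and_sync_networks(self):
            print("applying accumulated gradients and syncing networks")


    def maybe_apply_gradients(networks, current_episode, apply_gradients_every_x_episodes):
        # gradients are accumulated on every call, but applied only on this schedule
        if current_episode % apply_gradients_every_x_episodes == 0:
            for network in networks.values():
                network.apply_gradients_and_sync_networks()
            return True
        return False

For example, with apply_gradients_every_x_episodes=5, the gradients accumulated over episodes 1-4 are applied together on episode 5, then again on episode 10, and so on.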