Mirror of https://github.com/gryf/coach.git (synced 2025-12-18 11:40:18 +01:00)
Itaicaspi/episode reset refactoring (#105)
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* reordering of the episode reset operation and allowing to store episodes only when they are terminated
* revert tensorflow-gpu to 1.9.0 + bug fix in should_train()
* tests readme file and refactoring of policy optimization agent train function
* Update README.md
* Update README.md
* additional policy optimization train function simplifications
* Updated the traces after the reordering of the environment reset
* docker and jenkins files
* updated the traces to the ones from within the docker container
* updated traces and added control suite to the docker
* updated jenkins file with the intel proxy + updated doom basic a3c test params
* updated line breaks in jenkins file
* added a missing line break in jenkins file
* refining trace tests ignored presets + adding a configurable beta entropy value
* switch the order of trace and golden tests in jenkins + fix golden tests processes not killed issue
* updated benchmarks for dueling ddqn breakout and pong
* allowing dynamic updates to the loss weights + bug fix in episode.update_returns
* remove docker and jenkins file
@@ -46,7 +46,7 @@ class DuelingQHead(QHead):
         with tf.variable_scope("action_advantage"):
             self.action_advantage = self.dense_layer(512)(input_layer, activation=self.activation_function, name='fc1')
             self.action_advantage = self.dense_layer(self.num_actions)(self.action_advantage, name='fc2')
-            self.action_mean = tf.reduce_mean(self.action_advantage, axis=1, keep_dims=True)
+            self.action_mean = tf.reduce_mean(self.action_advantage, axis=1, keepdims=True)
             self.action_advantage = self.action_advantage - self.action_mean
 
         # merge to state-action value function Q
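The only change in this hunk is the rename of `keep_dims` to `keepdims` in `tf.reduce_mean`; the old argument name was deprecated in later TensorFlow 1.x releases. A minimal, self-contained sketch of the same dueling aggregation step, with illustrative tensor names that are not taken from the Coach source:

```python
# Sketch of the dueling aggregation Q = V + (A - mean(A)) in TF 1.x.
# `state_value` and `action_advantage` are made-up placeholders for illustration.
import tensorflow as tf

num_actions = 4
state_value = tf.placeholder(tf.float32, [None, 1])            # V(s), shape [batch, 1]
action_advantage = tf.placeholder(tf.float32, [None, num_actions])  # A(s, a)

# keepdims=True keeps the reduced axis so the mean broadcasts back over the actions
action_mean = tf.reduce_mean(action_advantage, axis=1, keepdims=True)
q_values = state_value + (action_advantage - action_mean)
```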
@@ -59,7 +59,10 @@ class Head(object):
         self.loss = []
         self.loss_type = []
         self.regularizations = []
-        self.loss_weight = force_list(loss_weight)
+        # self.loss_weight = force_list(loss_weight)
+        self.loss_weight = tf.Variable(force_list(loss_weight), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
+        self.loss_weight_placeholder = tf.placeholder("float")
+        self.set_loss_weight = tf.assign(self.loss_weight, self.loss_weight_placeholder)
         self.target = []
         self.importance_weight = []
         self.input = []
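The hunk above replaces the plain Python `loss_weight` list with a non-trainable local `tf.Variable` plus a placeholder and an assign op, which is what allows the commit's "dynamic updates to the loss weights" without rebuilding the graph. A self-contained sketch of that variable/placeholder/assign pattern in isolation (TF 1.x session usage; the values are illustrative):

```python
# Pattern sketch: a hyperparameter stored as a local, non-trainable variable
# that can be overwritten at runtime through an assign op.
import tensorflow as tf

loss_weight = tf.Variable([1.0], trainable=False,
                          collections=[tf.GraphKeys.LOCAL_VARIABLES])
loss_weight_placeholder = tf.placeholder(tf.float32)
set_loss_weight = tf.assign(loss_weight, loss_weight_placeholder)

with tf.Session() as sess:
    # the variable lives in LOCAL_VARIABLES, so use the local initializer
    sess.run(tf.local_variables_initializer())
    print(sess.run(loss_weight))                                    # [1.0]
    # later, e.g. on a schedule, push a new weight into the running graph
    sess.run(set_loss_weight, feed_dict={loss_weight_placeholder: [0.5]})
    print(sess.run(loss_weight))                                    # [0.5]
```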
@@ -48,7 +48,11 @@ class PolicyHead(Head):
 
         # a scalar weight that penalizes low entropy values to encourage exploration
         if hasattr(agent_parameters.algorithm, 'beta_entropy'):
-            self.beta = agent_parameters.algorithm.beta_entropy
+            # we set the beta value as a tf variable so it can be updated later if needed
+            self.beta = tf.Variable(agent_parameters.algorithm.beta_entropy,
+                                    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
+            self.beta_placeholder = tf.placeholder('float')
+            self.set_beta = tf.assign(self.beta, self.beta_placeholder)
 
         # a scalar weight that penalizes high activation values (before the activation function) for the final layer
         if hasattr(agent_parameters.algorithm, 'action_penalty'):
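Here `beta` likewise becomes a runtime-adjustable variable, so the entropy coefficient can be changed (e.g. annealed) during training. For context, a hedged sketch of where such a coefficient typically enters a policy-gradient loss; the formulation below is a generic illustration with made-up tensor names, not Coach's exact implementation:

```python
# Generic policy-gradient loss with an entropy bonus scaled by beta (TF 1.x).
import tensorflow as tf

num_actions = 4
logits = tf.placeholder(tf.float32, [None, num_actions])   # unnormalized action scores
advantages = tf.placeholder(tf.float32, [None])
actions = tf.placeholder(tf.int32, [None])

log_probs = tf.nn.log_softmax(logits)
probs = tf.nn.softmax(logits)
entropy = -tf.reduce_sum(probs * log_probs, axis=1)         # per-sample policy entropy

# non-trainable, locally-collected variable, mirroring the pattern in the hunk above
beta = tf.Variable(0.01, trainable=False,
                   collections=[tf.GraphKeys.LOCAL_VARIABLES])

action_log_probs = tf.reduce_sum(tf.one_hot(actions, num_actions) * log_probs, axis=1)
# higher entropy reduces the loss, scaled by beta, encouraging exploration
loss = -tf.reduce_mean(action_log_probs * advantages + beta * entropy)
```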