mirror of https://github.com/gryf/coach.git synced 2025-12-17 11:10:20 +01:00

Parallel agents fixes (#95)

* Parallel-agents bug fixes: checkpoint restore and TensorBoard integration.
* Added support for narrow networks.
* Added reference code for keeping an unlimited number of checkpoints.
Authored by Itai Caspi on 2018-05-24 14:24:19 +03:00, committed via GitHub.
Parent commit: 6c0b59b4de
Commit: d302168c8c
10 changed files with 75 additions and 41 deletions

View File

@@ -550,9 +550,10 @@ class Agent(object):
             if current_snapshot_period > model_snapshots_periods_passed:
                 model_snapshots_periods_passed = current_snapshot_period
                 self.save_model(model_snapshots_periods_passed)
-                to_pickle(self.running_observation_stats,
-                          os.path.join(self.tp.save_model_dir,
-                                       "running_stats.p".format(model_snapshots_periods_passed)))
+                if self.running_observation_stats is not None:
+                    to_pickle(self.running_observation_stats,
+                              os.path.join(self.tp.save_model_dir,
+                                           "running_stats.p".format(model_snapshots_periods_passed)))
 
         # play and record in replay buffer
         if self.tp.agent.collect_new_data:

View File

@@ -69,7 +69,7 @@ class ClippedPPOAgent(ActorCriticAgent):
screen.warning("WARNING: The requested policy gradient rescaler is not available") screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize # standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages) advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
for transition, advantage, value_target in zip(batch, advantages, value_targets): for transition, advantage, value_target in zip(batch, advantages, value_targets):
transition.info['advantage'] = advantage transition.info['advantage'] = advantage
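
A quick numerical illustration (toy example, not code from the commit) of why the added epsilon matters: when every advantage in a batch is identical, np.std is zero and the old expression divides by zero, producing NaNs.

import numpy as np

advantages = np.array([0.7, 0.7, 0.7, 0.7])  # a batch where all advantages coincide
standardized = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
print(standardized)  # [0. 0. 0. 0.] rather than [nan nan nan nan]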

View File

@@ -81,6 +81,7 @@ class NetworkWrapper(object):
             variables_to_restore = tf.global_variables()
             variables_to_restore = [v for v in variables_to_restore if '/online' in v.name]
             self.model_saver = tf.train.Saver(variables_to_restore)
+            #, max_to_keep=None) # uncomment to unlimit number of stored checkpoints
         if self.tp.sess and self.tp.checkpoint_restore_dir:
             checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir)
             screen.log_title("Loading checkpoint: {}".format(checkpoint))

View File

@@ -15,18 +15,20 @@
 #
 import tensorflow as tf
-from configurations import EmbedderComplexity
+from configurations import EmbedderDepth, EmbedderWidth
 
 
 class InputEmbedder(object):
     def __init__(self, input_size, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
         self.name = name
         self.input_size = input_size
         self.activation_function = activation_function
         self.input = None
         self.output = None
-        self.embedder_complexity = embedder_complexity
+        self.embedder_depth = embedder_depth
+        self.embedder_width = embedder_width
 
     def __call__(self, prev_input_placeholder=None):
         with tf.variable_scope(self.get_name()):
@@ -47,15 +49,16 @@ class InputEmbedder(object):
 class ImageEmbedder(InputEmbedder):
     def __init__(self, input_size, input_rescaler=255.0, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
-        InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
+        InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name)
         self.input_rescaler = input_rescaler
 
     def _build_module(self):
         # image observation
         rescaled_observation_stack = self.input / self.input_rescaler
-        if self.embedder_complexity == EmbedderComplexity.Shallow:
+        if self.embedder_depth == EmbedderDepth.Shallow:
             # same embedder as used in the original DQN paper
             self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
                                                       filters=32, kernel_size=(8, 8), strides=(4, 4),
@@ -73,7 +76,7 @@ class ImageEmbedder(InputEmbedder):
             self.output = tf.contrib.layers.flatten(self.observation_conv3)
-        elif self.embedder_complexity == EmbedderComplexity.Deep:
+        elif self.embedder_depth == EmbedderDepth.Deep:
             # the embedder used in the CARLA papers
             self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
                                                       filters=32, kernel_size=(5, 5), strides=(2, 2),
@@ -115,24 +118,27 @@ class ImageEmbedder(InputEmbedder):
 class VectorEmbedder(InputEmbedder):
     def __init__(self, input_size, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
-        InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
+        InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name)
 
     def _build_module(self):
         # vector observation
         input_layer = tf.contrib.layers.flatten(self.input)
-        if self.embedder_complexity == EmbedderComplexity.Shallow:
-            self.output = tf.layers.dense(input_layer, 256, activation=self.activation_function,
+        width = 128 if self.embedder_width == EmbedderWidth.Wide else 32
+
+        if self.embedder_depth == EmbedderDepth.Shallow:
+            self.output = tf.layers.dense(input_layer, 2*width, activation=self.activation_function,
                                           name='fc1')
-        elif self.embedder_complexity == EmbedderComplexity.Deep:
+        elif self.embedder_depth == EmbedderDepth.Deep:
             # the embedder used in the CARLA papers
-            self.observation_fc1 = tf.layers.dense(input_layer, 128, activation=self.activation_function,
+            self.observation_fc1 = tf.layers.dense(input_layer, width, activation=self.activation_function,
                                                    name='fc1')
-            self.observation_fc2 = tf.layers.dense(self.observation_fc1, 128, activation=self.activation_function,
+            self.observation_fc2 = tf.layers.dense(self.observation_fc1, width, activation=self.activation_function,
                                                    name='fc2')
-            self.output = tf.layers.dense(self.observation_fc2, 128, activation=self.activation_function,
+            self.output = tf.layers.dense(self.observation_fc2, width, activation=self.activation_function,
                                           name='fc3')
         else:
             raise ValueError("The defined embedder complexity value is invalid")

View File

@@ -36,6 +36,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         self.output_heads = []
         self.activation_function = self.get_activation_function(
             tuning_parameters.agent.hidden_layers_activation_function)
+        self.embedder_width = tuning_parameters.agent.embedder_width
 
         TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)
@@ -57,22 +58,26 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         def get_observation_embedding(with_timestep=False):
             if self.input_height > 1:
                 return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation",
-                                     input_rescaler=self.tp.agent.input_rescaler)
+                                     input_rescaler=self.tp.agent.input_rescaler, embedder_width=self.embedder_width)
             else:
-                return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation")
+                return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation",
+                                      embedder_width=self.embedder_width)
 
         input_mapping = {
             InputTypes.Observation: get_observation_embedding(),
-            InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements"),
-            InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector"),
-            InputTypes.Action: VectorEmbedder((self.num_actions,), name="action"),
+            InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements",
+                                                    embedder_width=self.embedder_width),
+            InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector",
+                                                  embedder_width=self.embedder_width),
+            InputTypes.Action: VectorEmbedder((self.num_actions,), name="action",
+                                              embedder_width=self.embedder_width),
             InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
         }
         return input_mapping[embedder_type]
 
     def get_middleware_embedder(self, middleware_type):
         return {MiddlewareTypes.LSTM: LSTM_Embedder,
-                MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function)
+                MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function, self.embedder_width)
 
     def get_output_head(self, head_type, head_idx, loss_weight=1.):
         output_mapping = {
@@ -174,7 +179,8 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
             self.losses = tf.losses.get_losses(self.name)
             self.losses += tf.losses.get_regularization_losses(self.name)
             self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.name)
-            tf.summary.scalar('total_loss', self.total_loss)
+            if self.tp.visualization.tensorboard:
+                tf.summary.scalar('total_loss', self.total_loss)
 
             # Learning rate
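
A small sketch of the gating pattern (the flag stands in for self.tp.visualization.tensorboard; this is not the repository's code): when TensorBoard is disabled, no scalar summary is registered, so tf.summary.merge_all() returns None and no summary ops are run during training.

import tensorflow as tf

tensorboard_enabled = False  # stands in for self.tp.visualization.tensorboard
total_loss = tf.constant(0.0, name='total_loss')

if tensorboard_enabled:
    tf.summary.scalar('total_loss', total_loss)

merged = tf.summary.merge_all()  # None when nothing was registered
if merged is not None:
    writer = tf.summary.FileWriter('/tmp/coach_summaries', graph=tf.get_default_graph())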

View File

@@ -395,7 +395,6 @@ class PPOHead(Head):
     def _build_module(self, input_layer):
         eps = 1e-15
-
         if self.discrete_controls:
             self.actions = tf.placeholder(tf.int32, [None], name="actions")
         else:
@@ -410,7 +409,7 @@ class PPOHead(Head):
             self.policy_mean = tf.nn.softmax(policy_values, name="policy")
 
             # define the distributions for the policy and the old policy
-            self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
+            self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_mean + eps))
             self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
 
             self.output = self.policy_mean
@@ -445,7 +444,7 @@ class PPOHead(Head):
         # calculate surrogate loss
         self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
         self.target = self.advantages
-        self.likelihood_ratio = self.action_probs_wrt_policy / self.action_probs_wrt_old_policy
+        self.likelihood_ratio = self.action_probs_wrt_policy / (self.action_probs_wrt_old_policy + eps)
         if self.clip_likelihood_ratio_using_epsilon is not None:
             max_value = 1 + self.clip_likelihood_ratio_using_epsilon
             min_value = 1 - self.clip_likelihood_ratio_using_epsilon
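
A toy illustration (not the commit's code) of what the guard prevents: if an action's probability under the old policy underflows to exactly zero, the unguarded ratio becomes infinite and the surrogate loss turns into inf/NaN; the epsilon keeps both finite.

import numpy as np

eps = 1e-15
clip_eps = 0.2

action_probs_wrt_policy = np.array([0.30, 0.05, 0.90])
action_probs_wrt_old_policy = np.array([0.25, 0.00, 0.85])  # note the exact zero

ratio = action_probs_wrt_policy / (action_probs_wrt_old_policy + eps)  # large but finite for the zero entry
clipped = np.clip(ratio, 1 - clip_eps, 1 + clip_eps)

advantages = np.array([1.0, -0.5, 0.2])
surrogate = np.minimum(ratio * advantages, clipped * advantages)  # standard clipped PPO objective
loss = -np.mean(surrogate)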

View File

@@ -16,13 +16,15 @@
 import tensorflow as tf
 import numpy as np
+from configurations import EmbedderWidth
 
 
 class MiddlewareEmbedder(object):
-    def __init__(self, activation_function=tf.nn.relu, name="middleware_embedder"):
+    def __init__(self, activation_function=tf.nn.relu, embedder_width=EmbedderWidth.Wide, name="middleware_embedder"):
         self.name = name
         self.input = None
         self.output = None
+        self.embedder_width = embedder_width
         self.activation_function = activation_function
 
     def __call__(self, input_layer):
@@ -70,4 +72,6 @@ class LSTM_Embedder(MiddlewareEmbedder):
 class FC_Embedder(MiddlewareEmbedder):
     def _build_module(self):
-        self.output = tf.layers.dense(self.input, 512, activation=self.activation_function, name='fc1')
+        width = 512 if self.embedder_width == EmbedderWidth.Wide else 64
+        self.output = tf.layers.dense(self.input, width, activation=self.activation_function, name='fc1')

View File

@@ -32,11 +32,6 @@ class InputTypes(object):
     TimedObservation = 5
 
 
-class EmbedderComplexity(object):
-    Shallow = 1
-    Deep = 2
-
-
 class OutputTypes(object):
     Q = 1
     DuelingQ = 2
@@ -51,6 +46,17 @@ class OutputTypes(object):
     QuantileRegressionQ = 11
 
 
+class EmbedderDepth(object):
+    Shallow = 1
+    Deep = 2
+
+
+class EmbedderWidth(object):
+    Narrow = 1
+    Wide = 2
+
+
 class MiddlewareTypes(object):
     LSTM = 1
     FC = 2
@@ -82,7 +88,8 @@ class AgentParameters(Parameters):
     middleware_type = MiddlewareTypes.FC
     loss_weights = [1.0]
     stop_gradients_from_head = [False]
-    embedder_complexity = EmbedderComplexity.Shallow
+    embedder_depth = EmbedderDepth.Shallow
+    embedder_width = EmbedderWidth.Wide
     num_output_head_copies = 1
     use_measurements = False
     use_accumulated_reward_as_measurement = False

View File

@@ -128,11 +128,14 @@ if __name__ == "__main__":
     def init_fn(scaffold, session):
         session.run(init_all_op)
 
+    #saver = tf.train.Saver(max_to_keep=None) # uncomment to unlimit number of stored checkpoints
     scaffold = tf.train.Scaffold(init_op=init_all_op,
                                  init_fn=init_fn,
                                  ready_op=ready_op,
                                  ready_for_local_init_op=ready_for_local_init_op,
                                  local_init_op=local_init_op)
+                                 #saver=saver) # uncomment to unlimit number of stored checkpoints
 
     # Due to awkward tensorflow behavior where the same variable is used to decide whether to restore a model
     # (and where from), or just save the model (and where to), we employ the below. In case where a restore folder
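
A minimal sketch of the reference code these comments point at (illustrative only; the real script also wires up the ready and local-init ops shown above): a Saver built with max_to_keep=None can be handed to the Scaffold, and a MonitoredTrainingSession created from that scaffold then keeps every checkpoint it writes.

import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
init_all_op = tf.global_variables_initializer()

saver = tf.train.Saver(max_to_keep=None)  # no limit on the number of stored checkpoints
scaffold = tf.train.Scaffold(init_op=init_all_op, saver=saver)

# Hypothetical usage:
# with tf.train.MonitoredTrainingSession(checkpoint_dir='/tmp/coach_checkpoints',
#                                        scaffold=scaffold) as sess:
#     sess.run(global_step)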
@@ -156,6 +159,10 @@ if __name__ == "__main__":
         tuning_parameters.sess = sess
         for network in agent.networks:
             network.set_session(sess)
+            # if hasattr(network.global_network, 'lock_init'):
+            #     sess.run(network.global_network.lock_init)
+            # if hasattr(network.global_network, 'release_init'):
+            #     sess.run(network.global_network.release_init)
 
         if tuning_parameters.visualization.tensorboard:
             # Write the merged summaries to the current experiment directory

View File

@@ -664,8 +664,11 @@ class Humanoid_ClippedPPO(Preset):
     def __init__(self):
         Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters)
         self.env.level = 'Humanoid-v1'
-        self.learning_rate = 0.0001
+        self.agent.embedder_width = EmbedderWidth.Narrow
+        self.learning_rate = 0.00001
         self.num_heatup_steps = 0
+        self.evaluation_episodes = 1
+        self.evaluate_every_x_episodes = 1
         self.agent.num_consecutive_training_steps = 1
         self.agent.num_consecutive_playing_steps = 2048
         self.agent.discount = 0.99
@@ -1337,7 +1340,7 @@ class Breakout_A3C(Preset):
 class Carla_A3C(Preset):
     def __init__(self):
         Preset.__init__(self, ActorCritic, Carla, EntropyExploration)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_complexity = EmbedderDepth.Deep
         self.agent.policy_gradient_rescaler = 'GAE'
         self.learning_rate = 0.0001
         self.num_heatup_steps = 0
@@ -1354,7 +1357,7 @@ class Carla_A3C(Preset):
 class Carla_DDPG(Preset):
     def __init__(self):
         Preset.__init__(self, DDPG, Carla, OUExploration)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_complexity = EmbedderDepth.Deep
         self.learning_rate = 0.0001
         self.num_heatup_steps = 1000
         self.agent.num_consecutive_training_steps = 5
@@ -1363,7 +1366,7 @@ class Carla_DDPG(Preset):
 class Carla_BC(Preset):
     def __init__(self):
         Preset.__init__(self, BC, Carla, ExplorationParameters)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_complexity = EmbedderDepth.Deep
         self.agent.load_memory_from_file_path = 'datasets/carla_town1.p'
         self.learning_rate = 0.0005
         self.num_heatup_steps = 0