diff --git a/agents/agent.py b/agents/agent.py
index ddba3ef..0ab4db3 100644
--- a/agents/agent.py
+++ b/agents/agent.py
@@ -550,9 +550,10 @@ class Agent(object):
             if current_snapshot_period > model_snapshots_periods_passed:
                 model_snapshots_periods_passed = current_snapshot_period
                 self.save_model(model_snapshots_periods_passed)
-                to_pickle(self.running_observation_stats,
-                          os.path.join(self.tp.save_model_dir,
-                                       "running_stats.p".format(model_snapshots_periods_passed)))
+                if self.running_observation_stats is not None:
+                    to_pickle(self.running_observation_stats,
+                              os.path.join(self.tp.save_model_dir,
+                                           "running_stats.p".format(model_snapshots_periods_passed)))
 
         # play and record in replay buffer
         if self.tp.agent.collect_new_data:
diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py
index f051b31..ad066ae 100644
--- a/agents/clipped_ppo_agent.py
+++ b/agents/clipped_ppo_agent.py
@@ -69,7 +69,7 @@ class ClippedPPOAgent(ActorCriticAgent):
             screen.warning("WARNING: The requested policy gradient rescaler is not available")
 
         # standardize
-        advantages = (advantages - np.mean(advantages)) / np.std(advantages)
+        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)
 
         for transition, advantage, value_target in zip(batch, advantages, value_targets):
             transition.info['advantage'] = advantage
diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py
index ef026e6..7388587 100644
--- a/architectures/network_wrapper.py
+++ b/architectures/network_wrapper.py
@@ -81,6 +81,7 @@ class NetworkWrapper(object):
             variables_to_restore = tf.global_variables()
            variables_to_restore = [v for v in variables_to_restore if '/online' in v.name]
             self.model_saver = tf.train.Saver(variables_to_restore)
+            #, max_to_keep=None)  # uncomment to remove the limit on the number of stored checkpoints
         if self.tp.sess and self.tp.checkpoint_restore_dir:
             checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir)
             screen.log_title("Loading checkpoint: {}".format(checkpoint))
diff --git a/architectures/tensorflow_components/embedders.py b/architectures/tensorflow_components/embedders.py
index 880de2f..b3f36cb 100644
--- a/architectures/tensorflow_components/embedders.py
+++ b/architectures/tensorflow_components/embedders.py
@@ -15,18 +15,20 @@
 #
 
 import tensorflow as tf
-from configurations import EmbedderComplexity
+from configurations import EmbedderDepth, EmbedderWidth
 
 
 class InputEmbedder(object):
     def __init__(self, input_size, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
         self.name = name
         self.input_size = input_size
         self.activation_function = activation_function
         self.input = None
         self.output = None
-        self.embedder_complexity = embedder_complexity
+        self.embedder_depth = embedder_depth
+        self.embedder_width = embedder_width
 
     def __call__(self, prev_input_placeholder=None):
         with tf.variable_scope(self.get_name()):
@@ -47,15 +49,16 @@ class InputEmbedder(object):
 
 class ImageEmbedder(InputEmbedder):
     def __init__(self, input_size, input_rescaler=255.0, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
-        InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
+        InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name)
        self.input_rescaler = input_rescaler
 
     def _build_module(self):
         # image observation
         rescaled_observation_stack = self.input / self.input_rescaler
 
-        if self.embedder_complexity == EmbedderComplexity.Shallow:
+        if self.embedder_depth == EmbedderDepth.Shallow:
             # same embedder as used in the original DQN paper
             self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
                                                       filters=32, kernel_size=(8, 8), strides=(4, 4),
@@ -73,7 +76,7 @@ class ImageEmbedder(InputEmbedder):
 
             self.output = tf.contrib.layers.flatten(self.observation_conv3)
 
-        elif self.embedder_complexity == EmbedderComplexity.Deep:
+        elif self.embedder_depth == EmbedderDepth.Deep:
             # the embedder used in the CARLA papers
             self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
                                                       filters=32, kernel_size=(5, 5), strides=(2, 2),
@@ -115,24 +118,27 @@ class ImageEmbedder(InputEmbedder):
 
 class VectorEmbedder(InputEmbedder):
     def __init__(self, input_size, activation_function=tf.nn.relu,
-                 embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
-        InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
+                 embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide,
+                 name="embedder"):
+        InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name)
 
     def _build_module(self):
         # vector observation
         input_layer = tf.contrib.layers.flatten(self.input)
 
-        if self.embedder_complexity == EmbedderComplexity.Shallow:
-            self.output = tf.layers.dense(input_layer, 256, activation=self.activation_function,
+        width = 128 if self.embedder_width == EmbedderWidth.Wide else 32
+
+        if self.embedder_depth == EmbedderDepth.Shallow:
+            self.output = tf.layers.dense(input_layer, 2*width, activation=self.activation_function,
                                           name='fc1')
-        elif self.embedder_complexity == EmbedderComplexity.Deep:
+        elif self.embedder_depth == EmbedderDepth.Deep:
             # the embedder used in the CARLA papers
-            self.observation_fc1 = tf.layers.dense(input_layer, 128, activation=self.activation_function,
+            self.observation_fc1 = tf.layers.dense(input_layer, width, activation=self.activation_function,
                                                    name='fc1')
-            self.observation_fc2 = tf.layers.dense(self.observation_fc1, 128, activation=self.activation_function,
+            self.observation_fc2 = tf.layers.dense(self.observation_fc1, width, activation=self.activation_function,
                                                    name='fc2')
-            self.output = tf.layers.dense(self.observation_fc2, 128, activation=self.activation_function,
+            self.output = tf.layers.dense(self.observation_fc2, width, activation=self.activation_function,
                                           name='fc3')
         else:
             raise ValueError("The defined embedder complexity value is invalid")
diff --git a/architectures/tensorflow_components/general_network.py b/architectures/tensorflow_components/general_network.py
index 03bb2a9..a4e69ff 100644
--- a/architectures/tensorflow_components/general_network.py
+++ b/architectures/tensorflow_components/general_network.py
@@ -36,6 +36,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         self.output_heads = []
         self.activation_function = self.get_activation_function(
             tuning_parameters.agent.hidden_layers_activation_function)
+        self.embedder_width = tuning_parameters.agent.embedder_width
 
         TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)
 
@@ -57,22 +58,26 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         def get_observation_embedding(with_timestep=False):
             if self.input_height > 1:
                 return ImageEmbedder((self.input_height, self.input_width, self.input_depth),
                                     name="observation",
-                                     input_rescaler=self.tp.agent.input_rescaler)
+                                     input_rescaler=self.tp.agent.input_rescaler, embedder_width=self.embedder_width)
             else:
-                return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation")
+                return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation",
+                                      embedder_width=self.embedder_width)
 
         input_mapping = {
             InputTypes.Observation: get_observation_embedding(),
-            InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements"),
-            InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector"),
-            InputTypes.Action: VectorEmbedder((self.num_actions,), name="action"),
+            InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements",
+                                                    embedder_width=self.embedder_width),
+            InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector",
+                                                  embedder_width=self.embedder_width),
+            InputTypes.Action: VectorEmbedder((self.num_actions,), name="action",
+                                              embedder_width=self.embedder_width),
             InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
         }
 
         return input_mapping[embedder_type]
 
     def get_middleware_embedder(self, middleware_type):
         return {MiddlewareTypes.LSTM: LSTM_Embedder,
-                MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function)
+                MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function, self.embedder_width)
 
     def get_output_head(self, head_type, head_idx, loss_weight=1.):
         output_mapping = {
@@ -174,7 +179,8 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         self.losses = tf.losses.get_losses(self.name)
         self.losses += tf.losses.get_regularization_losses(self.name)
         self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.name)
-        tf.summary.scalar('total_loss', self.total_loss)
+        if self.tp.visualization.tensorboard:
+            tf.summary.scalar('total_loss', self.total_loss)
 
         # Learning rate
diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py
index 616ab13..b463d7f 100644
--- a/architectures/tensorflow_components/heads.py
+++ b/architectures/tensorflow_components/heads.py
@@ -395,7 +395,6 @@ class PPOHead(Head):
     def _build_module(self, input_layer):
         eps = 1e-15
-
         if self.discrete_controls:
             self.actions = tf.placeholder(tf.int32, [None], name="actions")
         else:
@@ -410,7 +409,7 @@ class PPOHead(Head):
         self.policy_mean = tf.nn.softmax(policy_values, name="policy")
 
         # define the distributions for the policy and the old policy
-        self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
+        self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_mean + eps))
         self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
 
         self.output = self.policy_mean
@@ -445,7 +444,7 @@ class PPOHead(Head):
         # calculate surrogate loss
         self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
         self.target = self.advantages
-        self.likelihood_ratio = self.action_probs_wrt_policy / self.action_probs_wrt_old_policy
+        self.likelihood_ratio = self.action_probs_wrt_policy / (self.action_probs_wrt_old_policy + eps)
         if self.clip_likelihood_ratio_using_epsilon is not None:
             max_value = 1 + self.clip_likelihood_ratio_using_epsilon
             min_value = 1 - self.clip_likelihood_ratio_using_epsilon
diff --git a/architectures/tensorflow_components/middleware.py b/architectures/tensorflow_components/middleware.py
index dfe1597..eee5925 100644
--- a/architectures/tensorflow_components/middleware.py
+++ b/architectures/tensorflow_components/middleware.py
@@ -16,13 +16,15 @@
 
 import tensorflow as tf
 import numpy as np
+from configurations import EmbedderWidth
 
 
 class MiddlewareEmbedder(object):
-    def __init__(self, activation_function=tf.nn.relu, name="middleware_embedder"):
+    def __init__(self, activation_function=tf.nn.relu, embedder_width=EmbedderWidth.Wide, name="middleware_embedder"):
         self.name = name
         self.input = None
         self.output = None
+        self.embedder_width = embedder_width
         self.activation_function = activation_function
 
     def __call__(self, input_layer):
@@ -70,4 +72,6 @@ class LSTM_Embedder(MiddlewareEmbedder):
 
 class FC_Embedder(MiddlewareEmbedder):
     def _build_module(self):
-        self.output = tf.layers.dense(self.input, 512, activation=self.activation_function, name='fc1')
+        width = 512 if self.embedder_width == EmbedderWidth.Wide else 64
+        self.output = tf.layers.dense(self.input, width, activation=self.activation_function, name='fc1')
+
diff --git a/configurations.py b/configurations.py
index 5e553d8..a235c6c 100644
--- a/configurations.py
+++ b/configurations.py
@@ -32,11 +32,6 @@ class InputTypes(object):
     TimedObservation = 5
 
 
-class EmbedderComplexity(object):
-    Shallow = 1
-    Deep = 2
-
-
 class OutputTypes(object):
     Q = 1
     DuelingQ = 2
@@ -51,6 +46,17 @@ class OutputTypes(object):
     QuantileRegressionQ = 11
 
 
+class EmbedderDepth(object):
+    Shallow = 1
+    Deep = 2
+
+
+class EmbedderWidth(object):
+    Narrow = 1
+    Wide = 2
+
+
 class MiddlewareTypes(object):
     LSTM = 1
     FC = 2
@@ -82,7 +88,8 @@ class AgentParameters(Parameters):
     middleware_type = MiddlewareTypes.FC
     loss_weights = [1.0]
     stop_gradients_from_head = [False]
-    embedder_complexity = EmbedderComplexity.Shallow
+    embedder_depth = EmbedderDepth.Shallow
+    embedder_width = EmbedderWidth.Wide
     num_output_head_copies = 1
     use_measurements = False
     use_accumulated_reward_as_measurement = False
diff --git a/parallel_actor.py b/parallel_actor.py
index c988e49..ac51649 100644
--- a/parallel_actor.py
+++ b/parallel_actor.py
@@ -128,11 +128,14 @@ if __name__ == "__main__":
         def init_fn(scaffold, session):
             session.run(init_all_op)
+
+        #saver = tf.train.Saver(max_to_keep=None)  # uncomment to remove the limit on the number of stored checkpoints
 
         scaffold = tf.train.Scaffold(init_op=init_all_op,
                                      init_fn=init_fn,
                                      ready_op=ready_op,
                                      ready_for_local_init_op=ready_for_local_init_op,
                                      local_init_op=local_init_op)
+                                     #saver=saver)  # uncomment to remove the limit on the number of stored checkpoints
 
         # Due to awkward tensorflow behavior where the same variable is used to decide whether to restore a model
         # (and where from), or just save the model (and where to), we employ the below. In case where a restore folder
@@ -156,6 +159,10 @@ if __name__ == "__main__":
         tuning_parameters.sess = sess
         for network in agent.networks:
             network.set_session(sess)
+            # if hasattr(network.global_network, 'lock_init'):
+            #     sess.run(network.global_network.lock_init)
+            # if hasattr(network.global_network, 'release_init'):
+            #     sess.run(network.global_network.release_init)
 
         if tuning_parameters.visualization.tensorboard:
             # Write the merged summaries to the current experiment directory
diff --git a/presets.py b/presets.py
index 21841e9..ae73a92 100644
--- a/presets.py
+++ b/presets.py
@@ -664,8 +664,11 @@ class Humanoid_ClippedPPO(Preset):
     def __init__(self):
         Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters)
         self.env.level = 'Humanoid-v1'
-        self.learning_rate = 0.0001
+        self.agent.embedder_width = EmbedderWidth.Narrow
+        self.learning_rate = 0.00001
         self.num_heatup_steps = 0
+        self.evaluation_episodes = 1
+        self.evaluate_every_x_episodes = 1
         self.agent.num_consecutive_training_steps = 1
         self.agent.num_consecutive_playing_steps = 2048
         self.agent.discount = 0.99
@@ -1337,7 +1340,7 @@ class Breakout_A3C(Preset):
 
 class Carla_A3C(Preset):
     def __init__(self):
         Preset.__init__(self, ActorCritic, Carla, EntropyExploration)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_depth = EmbedderDepth.Deep
         self.agent.policy_gradient_rescaler = 'GAE'
         self.learning_rate = 0.0001
         self.num_heatup_steps = 0
@@ -1354,7 +1357,7 @@ class Carla_DDPG(Preset):
     def __init__(self):
         Preset.__init__(self, DDPG, Carla, OUExploration)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_depth = EmbedderDepth.Deep
         self.learning_rate = 0.0001
         self.num_heatup_steps = 1000
         self.agent.num_consecutive_training_steps = 5
@@ -1363,7 +1366,7 @@ class Carla_BC(Preset):
     def __init__(self):
         Preset.__init__(self, BC, Carla, ExplorationParameters)
-        self.agent.embedder_complexity = EmbedderComplexity.Deep
+        self.agent.embedder_depth = EmbedderDepth.Deep
         self.agent.load_memory_from_file_path = 'datasets/carla_town1.p'
         self.learning_rate = 0.0005
         self.num_heatup_steps = 0