diff --git a/.gitignore b/.gitignore index 6fc79b4..bad8e2a 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,8 @@ roboschool *.orig docs/site coach_env +build +rl_coach.egg* +contrib +test_log_* +dist diff --git a/agents/agent.py b/agents/agent.py index b88c662..006544e 100644 --- a/agents/agent.py +++ b/agents/agent.py @@ -24,6 +24,8 @@ except: import copy from renderer import Renderer from configurations import Preset +from collections import deque +from utils import LazyStack from collections import OrderedDict from utils import RunPhase, Signal, is_empty, RunningStat from architectures import * @@ -214,6 +216,8 @@ class Agent(object): network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init + self.prepare_initial_state() + def preprocess_observation(self, observation): """ Preprocesses the given observation. @@ -291,9 +295,8 @@ class Agent(object): """ current_states = {} next_states = {} - - current_states['observation'] = np.array([transition.state['observation'] for transition in batch]) - next_states['observation'] = np.array([transition.next_state['observation'] for transition in batch]) + current_states['observation'] = np.array([np.array(transition.state['observation']) for transition in batch]) + next_states['observation'] = np.array([np.array(transition.next_state['observation']) for transition in batch]) actions = np.array([transition.action for transition in batch]) rewards = np.array([transition.reward for transition in batch]) game_overs = np.array([transition.game_over for transition in batch]) @@ -348,6 +351,23 @@ class Agent(object): for input_name in self.tp.agent.input_types.keys(): input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0) return input_state + + def prepare_initial_state(self): + """ + Create an initial state when starting a new episode + :return: None + """ + observation = self.preprocess_observation(self.env.state['observation']) + self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) + observation = LazyStack(self.curr_stack, -1) + + self.curr_state = { + 'observation': observation + } + if self.tp.agent.use_measurements: + self.curr_state['measurements'] = self.env.measurements + if self.tp.agent.use_accumulated_reward_as_measurement: + self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0) def act(self, phase=RunPhase.TRAIN): """ @@ -356,34 +376,12 @@ class Agent(object): :return: A boolean value that signals an episode termination """ - self.total_steps_counter += 1 + if phase != RunPhase.TEST: + self.total_steps_counter += 1 self.current_episode_steps_counter += 1 # get new action - action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0} - is_first_transition_in_episode = (self.curr_state == {}) - if is_first_transition_in_episode: - if not isinstance(self.env.state, dict): - raise ValueError(( - 'expected state to be a dictionary, found {}' - ).format(type(self.env.state))) - - state = self.env.state - # TODO: modify preprocess_observation to modify the entire state - # for now, only preprocess the observation - state['observation'] = self.preprocess_observation(state['observation']) - - # TODO: provide option to stack more than just the observation - # TODO: this should probably be happening in an environment wrapper anyway - state['observation'] = stack_observation([], 
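The `prepare_initial_state` hunk above replaces the old `stack_observation` path: at the start of every episode the agent now seeds a fixed-length `deque` with `observation_stack_size` copies of the first preprocessed frame and stores a `LazyStack` view of it, and `act` later just appends new frames to the same deque. A minimal sketch of that pattern, assuming 84x84 frames and a stack of 4 (the class below mirrors the `LazyStack` added to `utils.py` but is written stand-alone):

```python
import numpy as np
from collections import deque


class LazyStack(object):
    """Defers np.stack over a frame buffer until the array is actually needed."""
    def __init__(self, history, axis=None):
        self.history = list(history)   # shallow copy: later appends to the deque don't affect this view
        self.axis = axis

    def __array__(self, dtype=None):
        array = np.stack(self.history, axis=self.axis)
        return array.astype(dtype) if dtype is not None else array


stack_size = 4
first_frame = np.zeros((84, 84), dtype=np.uint8)      # stand-in for a preprocessed observation

# episode start: repeat the first frame so the stack is full from step 0
frame_stack = deque([first_frame] * stack_size, maxlen=stack_size)
curr_state = {'observation': LazyStack(frame_stack, -1)}

# every subsequent step: push the newest frame and take a fresh lazy view for next_state
frame_stack.append(np.ones((84, 84), dtype=np.uint8))
next_state = {'observation': LazyStack(frame_stack, -1)}

print(np.array(curr_state['observation']).shape)      # (84, 84, 4), materialized only here
```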
state['observation'], self.tp.env.observation_stack_size) - - self.curr_state = state - if self.tp.agent.use_measurements: - # TODO: this should be handled in the environment - self.curr_state['measurements'] = self.env.measurements - - if self.tp.agent.use_accumulated_reward_as_measurement: - self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0) + action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0} if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: action = self.env.get_random_action() @@ -409,8 +407,10 @@ class Agent(object): # initialize the next state # TODO: provide option to stack more than just the observation - next_state['observation'] = stack_observation(self.curr_state['observation'], next_state['observation'], self.tp.env.observation_stack_size) + self.curr_stack.append(next_state['observation']) + observation = LazyStack(self.curr_stack, -1) + next_state['observation'] = observation if self.tp.agent.use_measurements and 'measurements' in result.keys(): next_state['measurements'] = result['state']['measurements'] if self.tp.agent.use_accumulated_reward_as_measurement: @@ -516,6 +516,7 @@ class Agent(object): self.exploration_policy.change_phase(RunPhase.TRAIN) training_start_time = time.time() model_snapshots_periods_passed = -1 + self.reset_game() while self.training_iteration < self.tp.num_training_iterations: # evaluate @@ -526,7 +527,7 @@ class Agent(object): self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0) if evaluate_agent: - self.env.reset() + self.env.reset(force_environment_reset=True) self.last_episode_evaluation_ran = self.current_episode self.evaluate(self.tp.evaluation_episodes) diff --git a/agents/nec_agent.py b/agents/nec_agent.py index c123458..77520a4 100644 --- a/agents/nec_agent.py +++ b/agents/nec_agent.py @@ -27,10 +27,7 @@ class NECAgent(ValueOptimizationAgent): ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=False) self.current_episode_state_embeddings = [] - self.current_episode_actions = [] self.training_started = False - # if self.tp.checkpoint_restore_dir: - # self.load_dnd(self.tp.checkpoint_restore_dir) def learn_from_batch(self, batch): if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn): @@ -41,83 +38,57 @@ class NECAgent(ValueOptimizationAgent): screen.log_title("Finished collecting initial entries in DND. 
Starting to train network...") current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - result = self.main_network.train_and_sync_networks(current_states, total_return) + + TD_targets = self.main_network.online_network.predict(current_states) + + # only update the action that we have actually done in this transition + for i in range(self.tp.batch_size): + TD_targets[i, actions[i]] = total_return[i] + + # train the neural network + result = self.main_network.train_and_sync_networks(current_states, TD_targets) + total_loss = result[0] return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - """ - this method modifies the superclass's behavior in only 3 ways: + def act(self, phase=RunPhase.TRAIN): + if self.in_heatup: + # get embedding in heatup (otherwise we get it through choose_action) + embedding = self.main_network.online_network.predict( + self.tf_input_state(self.curr_state), + outputs=self.main_network.online_network.state_embedding) + self.current_episode_state_embeddings.append(embedding) - 1) the embedding is saved and stored in self.current_episode_state_embeddings - 2) the dnd output head is only called if it has a minimum number of entries in it - ideally, the dnd had would do this on its own, but in my attempt in encoding this - behavior in tensorflow, I ran into problems. Would definitely be worth - revisiting in the future - 3) during training, actions are saved and stored in self.current_episode_actions - if behaviors 1 and 2 were handled elsewhere, this could easily be implemented - as a wrapper around super instead of overriding this method entirelysearch - """ + return super().act(phase) - # get embedding - embedding = self.main_network.online_network.predict( + def get_prediction(self, curr_state): + # get the actions q values and the state embedding + embedding, actions_q_values = self.main_network.online_network.predict( self.tf_input_state(curr_state), - outputs=self.main_network.online_network.state_embedding) - self.current_episode_state_embeddings.append(embedding) + outputs=[self.main_network.online_network.state_embedding, + self.main_network.online_network.output_heads[0].output] + ) - # TODO: support additional heads. 
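The rewritten `learn_from_batch` above no longer regresses every head output towards the return; it predicts Q-values for all actions and overwrites only the entry of the action that was actually taken with the N-step `total_return`, so the other actions contribute no error. A small numpy sketch of that masking step (batch size and action count are arbitrary):

```python
import numpy as np

batch_size, num_actions = 32, 6
td_targets = np.random.randn(batch_size, num_actions).astype(np.float32)   # online network predictions
actions = np.random.randint(0, num_actions, size=batch_size)                # actions taken in the batch
total_return = np.random.randn(batch_size).astype(np.float32)               # N-step returns

# only the taken action gets a new target; the untouched entries produce zero loss
for i in range(batch_size):
    td_targets[i, actions[i]] = total_return[i]

# td_targets is then fed to train_and_sync_networks(current_states, td_targets)
```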
Right now all other heads are ignored - if self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn): - # if there are enough entries in the DND then we can query it to get the action values - # actions_q_values = [] - feed_dict = { - self.main_network.online_network.state_embedding: [embedding], - } - actions_q_values = self.main_network.sess.run( - self.main_network.online_network.output_heads[0].output, feed_dict=feed_dict) - else: - # get only the embedding so we can insert it to the DND - actions_q_values = [0] * self.action_space_size - - # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(actions_q_values) - # NOTE: this next line is not in the parent implementation - # NOTE: it could be implemented as a wrapper around the parent since action is returned - self.current_episode_actions.append(action) - else: - action = np.argmax(actions_q_values) - - # store the q values statistics for logging - self.q_values.add_sample(actions_q_values) - - # store information for plotting interactively (actual plotting is done in agent) - if self.tp.visualization.plot_action_values_online: - for idx, action_name in enumerate(self.env.actions_description): - self.episode_running_info[action_name].append(actions_q_values[idx]) - - action_value = {"action_value": actions_q_values[action]} - return action, action_value + # store the state embedding for inserting it to the DND later + self.current_episode_state_embeddings.append(embedding.squeeze()) + actions_q_values = actions_q_values[0][0] + return actions_q_values def reset_game(self, do_not_reset_env=False): - ValueOptimizationAgent.reset_game(self, do_not_reset_env) + super().reset_game(do_not_reset_env) - # make sure we already have at least one episode - if self.memory.num_complete_episodes() >= 1 and not self.in_heatup: - # get the last full episode that we have collected - episode = self.memory.get(-2) - returns = [] - for i in range(episode.length()): - returns.append(episode.get_transition(i).total_return) - # Just to deal with the end of heatup where there might be a case where it ends in a middle - # of an episode, and thus when getting the episode out of the ER, it will be a complete one whereas - # the other statistics collected here, are collected only during training. 
- returns = returns[-len(self.current_episode_actions):] + # get the last full episode that we have collected + episode = self.memory.get_last_complete_episode() + if episode is not None: + # the indexing is only necessary because the heatup can end in the middle of an episode + # this won't be required after fixing this so that when the heatup is ended, the episode is closed + returns = episode.get_transitions_attribute('total_return')[:len(self.current_episode_state_embeddings)] + actions = episode.get_transitions_attribute('action')[:len(self.current_episode_state_embeddings)] self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings, - self.current_episode_actions, returns) + actions, returns) self.current_episode_state_embeddings = [] - self.current_episode_actions = [] def save_model(self, model_id): self.main_network.save_model(model_id) diff --git a/agents/value_optimization_agent.py b/agents/value_optimization_agent.py index 1ad300d..75708d7 100644 --- a/agents/value_optimization_agent.py +++ b/agents/value_optimization_agent.py @@ -73,5 +73,5 @@ class ValueOptimizationAgent(Agent): for idx, action_name in enumerate(self.env.actions_description): self.episode_running_info[action_name].append(actions_q_values[idx]) - action_value = {"action_value": actions_q_values[action]} + action_value = {"action_value": actions_q_values[action], "max_action_value": np.max(actions_q_values)} return action, action_value diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py index 0867d8db..ef026e6 100644 --- a/architectures/network_wrapper.py +++ b/architectures/network_wrapper.py @@ -125,14 +125,15 @@ class NetworkWrapper(object): """ self.online_network.apply_gradients(self.online_network.accumulated_gradients) - def train_and_sync_networks(self, inputs, targets): + def train_and_sync_networks(self, inputs, targets, additional_fetches=[]): """ A generic training function that enables multi-threading training using a global network if necessary. :param inputs: The inputs for the network. 
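In the new `reset_game`, the agent fetches the last complete episode from the replay memory, truncates its per-transition returns and actions to the number of embeddings collected during the episode (to cover the heatup edge case noted in the comment), and inserts the triples into the per-action DND. A schematic version with a plain Python store standing in for the Annoy-backed `QDND` (the `SimpleDND` name and layout are illustrative, not Coach's API):

```python
import numpy as np


class SimpleDND(object):
    """Toy per-action store of (embedding, value) pairs."""
    def __init__(self, num_actions):
        self.keys = [[] for _ in range(num_actions)]
        self.values = [[] for _ in range(num_actions)]

    def add(self, embeddings, actions, values):
        for embedding, action, value in zip(embeddings, actions, values):
            self.keys[action].append(np.asarray(embedding))
            self.values[action].append(float(value))


# collected while acting: one embedding per step of the finished episode
episode_embeddings = [np.random.randn(512) for _ in range(100)]
# read back from the stored transitions of that episode
episode_actions = list(np.random.randint(0, 6, size=100))
episode_returns = list(np.random.randn(100))            # N-step bootstrapped total_return

dnd = SimpleDND(num_actions=6)
n = len(episode_embeddings)                             # mirror the truncation in the diff
dnd.add(episode_embeddings, episode_actions[:n], episode_returns[:n])
```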
:param targets: The targets corresponding to the given inputs + :param additional_fetches: Any additional tensor the user wants to fetch :return: The loss of the training iteration """ - result = self.online_network.accumulate_gradients(inputs, targets) + result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches) self.apply_gradients_and_sync_networks() return result diff --git a/architectures/tensorflow_components/general_network.py b/architectures/tensorflow_components/general_network.py index e8befc2..03bb2a9 100644 --- a/architectures/tensorflow_components/general_network.py +++ b/architectures/tensorflow_components/general_network.py @@ -56,7 +56,8 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): # the observation can be either an image or a vector def get_observation_embedding(with_timestep=False): if self.input_height > 1: - return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation") + return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", + input_rescaler=self.tp.agent.input_rescaler) else: return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation") @@ -191,7 +192,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): if tuning_parameters.agent.optimizer_type == 'Adam': self.optimizer = tf.train.AdamOptimizer(learning_rate=tuning_parameters.learning_rate) elif tuning_parameters.agent.optimizer_type == 'RMSProp': - self.optimizer = tf.train.RMSPropOptimizer(self.tp.learning_rate, decay=0.9, epsilon=0.01) + self.optimizer = tf.train.RMSPropOptimizer(tuning_parameters.learning_rate, decay=0.9, epsilon=0.01) elif tuning_parameters.agent.optimizer_type == 'LBFGS': self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B', options={'maxiter': 25}) diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py index 2653d59..616ab13 100644 --- a/architectures/tensorflow_components/heads.py +++ b/architectures/tensorflow_components/heads.py @@ -57,7 +57,8 @@ class Head(object): self.loss = force_list(self.loss) self.regularizations = force_list(self.regularizations) if self.is_local: - self.set_loss() + self.set_loss() + self._post_build() if self.is_local: return self.output, self.target, self.input @@ -76,6 +77,14 @@ class Head(object): """ pass + def _post_build(self): + """ + Optional function that allows adding any extra definitions after the head has been fully defined + For example, this allows doing additional calculations that are based on the loss + :return: None + """ + pass + def get_name(self): """ Get a formatted name for the module @@ -271,6 +280,9 @@ class DNDQHead(Head): else: self.loss_type = tf.losses.mean_squared_error self.tp = tuning_parameters + self.dnd_embeddings = [None]*self.num_actions + self.dnd_values = [None]*self.num_actions + self.dnd_indices = [None]*self.num_actions def _build_module(self, input_layer): # DND based Q head @@ -281,29 +293,29 @@ class DNDQHead(Head): else: self.DND = differentiable_neural_dictionary.QDND( self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient, - key_error_threshold=self.DND_key_error_threshold) + key_error_threshold=self.DND_key_error_threshold, learning_rate=self.tp.learning_rate) # Retrieve info from DND dictionary - # self.action = tf.placeholder(tf.int8, [None], name="action") - # self.input = self.action - self.output = [ + 
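`GeneralTensorFlowNetwork` now passes an `input_rescaler` from the agent parameters into the image embedder (255.0 by default, 1.0 for NEC in `configurations.py`). The parameter presumably divides the raw uint8 pixels before the convolutions; a short numpy illustration of that intent (the exact placement inside `ImageEmbedder` is an assumption):

```python
import numpy as np

def rescale_observation(observation, input_rescaler=255.0):
    # uint8 frames in [0, 255] -> float32 in [0, 1] when input_rescaler is 255.0;
    # input_rescaler=1.0 (the NEC setting) leaves the values unchanged
    return observation.astype(np.float32) / input_rescaler

frame = np.random.randint(0, 256, size=(84, 84, 4), dtype=np.uint8)
assert rescale_observation(frame).max() <= 1.0
```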
# We assume that all actions have enough entries in the DND + self.output = tf.transpose([ self._q_value(input_layer, action) for action in range(self.num_actions) - ] + ]) def _q_value(self, input_layer, action): result = tf.py_func(self.DND.query, - [input_layer, [action], self.number_of_nn], - [tf.float64, tf.float64]) - dnd_embeddings = tf.to_float(result[0]) - dnd_values = tf.to_float(result[1]) + [input_layer, action, self.number_of_nn], + [tf.float64, tf.float64, tf.int64]) + self.dnd_embeddings[action] = tf.to_float(result[0]) + self.dnd_values[action] = tf.to_float(result[1]) + self.dnd_indices[action] = result[2] # DND calculation - square_diff = tf.square(dnd_embeddings - tf.expand_dims(input_layer, 1)) + square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1)) distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta] weights = 1.0 / distances normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True) - return tf.reduce_sum(dnd_values * normalised_weights, axis=1) + return tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1) class NAFHead(Head): diff --git a/benchmarks/README.md b/benchmarks/README.md index d979cff..ba237e7 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -116,6 +116,14 @@ python3 coach.py -p Doom_Health_MMC -r ## NEC +## Pong_NEC + +```bash +python3 coach.py -p Pong_NEC -r +``` + +Pong_NEC + ## Doom_Basic_NEC ```bash diff --git a/benchmarks/img/Pong_NEC.png b/benchmarks/img/Pong_NEC.png new file mode 100644 index 0000000..4148669 Binary files /dev/null and b/benchmarks/img/Pong_NEC.png differ diff --git a/coach.py b/coach.py index 74882bb..8ba8cf3 100644 --- a/coach.py +++ b/coach.py @@ -34,11 +34,6 @@ import sys import subprocess from threading import Thread -try: - from Queue import Queue, Empty -except ImportError: - from queue import Queue, Empty # for Python 3.x - if len(set(failed_imports)) > 0: screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports)))) @@ -258,7 +253,8 @@ if __name__ == "__main__": # dump documentation logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) if not args.no_summary: - atexit.register(logger.print_summary) + atexit.register(logger.summarize_experiment) + screen.change_terminal_title(logger.experiment_name) # Single-threaded runs if run_dict['num_threads'] == 1: @@ -300,7 +296,7 @@ if __name__ == "__main__": "--worker_hosts={}".format(worker_hosts), "--job_name=ps", ] - parameter_server = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) + parameter_server = Popen(cmd) screen.log_title("*** Distributed Training ***") time.sleep(1) @@ -325,7 +321,7 @@ if __name__ == "__main__": "--job_name=worker", "--load_json={}".format(json_run_dict_path)] - p = Popen(workers_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) + p = Popen(workers_args) if i != run_dict['num_threads']: workers.append(p) diff --git a/configurations.py b/configurations.py index ddc5ce6..5e553d8 100644 --- a/configurations.py +++ b/configurations.py @@ -115,6 +115,7 @@ class AgentParameters(Parameters): replace_mse_with_huber_loss = False load_memory_from_file_path = None collect_new_data = True + input_rescaler = 255.0 # PPO related params target_kl_divergence = 0.01 @@ -154,6 +155,8 @@ class EnvironmentParameters(Parameters): desired_observation_width = 76 desired_observation_height = 60 normalize_observation = False + crop_observation = False + 
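`_q_value` above queries the per-action DND for the `number_of_knn` nearest stored keys and their values, then forms an inverse-distance weighted average: w_i = 1 / (||h - h_i||^2 + delta), normalized over the neighbours, with delta = `l2_norm_added_delta`. The same kernel in plain numpy for a single state embedding (shapes are assumptions; the TF version is batched and keeps the fetched embeddings and values as attributes so gradients can reach the DND values):

```python
import numpy as np

def dnd_q_value(query, neighbor_keys, neighbor_values, l2_norm_added_delta=0.001):
    """query: (d,) embedding, neighbor_keys: (k, d), neighbor_values: (k,)."""
    square_diff = np.square(neighbor_keys - query[None, :])       # (k, d)
    distances = square_diff.sum(axis=1) + l2_norm_added_delta     # (k,)
    weights = 1.0 / distances
    normalised_weights = weights / weights.sum()
    return float(np.sum(neighbor_values * normalised_weights))

k, d = 50, 512
q_value = dnd_q_value(np.random.randn(d), np.random.randn(k, d), np.random.randn(k))
```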
random_initialization_steps = 0 reward_scaling = 1.0 reward_clipping_min = None reward_clipping_max = None @@ -290,6 +293,8 @@ class Atari(EnvironmentParameters): desired_observation_width = 84 reward_clipping_max = 1.0 reward_clipping_min = -1.0 + random_initialization_steps = 30 + crop_observation = False # in the original paper the observation is cropped but not in the Nature paper class Doom(EnvironmentParameters): @@ -355,6 +360,7 @@ class DQN(AgentParameters): class DDQN(DQN): type = 'DDQNAgent' + num_steps_between_copying_online_weights_to_target = 30000 class DuelingDQN(DQN): @@ -384,17 +390,19 @@ class QuantileRegressionDQN(DQN): class NEC(AgentParameters): type = 'NECAgent' - optimizer_type = 'RMSProp' + optimizer_type = 'Adam' input_types = {'observation': InputTypes.Observation} output_types = [OutputTypes.DNDQ] loss_weights = [1.0] dnd_size = 500000 l2_norm_added_delta = 0.001 - new_value_shift_coefficient = 0.1 + new_value_shift_coefficient = 0.1 # alpha number_of_knn = 50 n_step = 100 bootstrap_total_return_from_old_policy = True - DND_key_error_threshold = 0.1 + DND_key_error_threshold = 0 + input_rescaler = 1.0 + num_consecutive_playing_steps = 4 class ActorCritic(AgentParameters): diff --git a/environments/gym_environment_wrapper.py b/environments/gym_environment_wrapper.py index 6c94aaa..c821bf8 100644 --- a/environments/gym_environment_wrapper.py +++ b/environments/gym_environment_wrapper.py @@ -19,6 +19,7 @@ from logger import * import gym import numpy as np import time +import random try: import roboschool from OpenGL import GL @@ -59,7 +60,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper): # self.env_spec = gym.spec(self.env_id) self.env.frameskip = self.frame_skip self.discrete_controls = type(self.env.action_space) != gym.spaces.box.Box - + self.random_initialization_steps = 0 self.state = self.reset(True)['state'] # render @@ -113,6 +114,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper): else: self.timestep_limit = None self.measurements_size = len(self.step(0)['info'].keys()) + self.random_initialization_steps = self.tp.env.random_initialization_steps def _wrap_state(self, state): if isinstance(self.env.observation_space, gym.spaces.Dict): @@ -155,8 +157,9 @@ class GymEnvironmentWrapper(EnvironmentWrapper): def _preprocess_state(self, state): # TODO: move this into wrapper - if any(env in self.env_id for env in ["Breakout", "Pong"]): - # crop image + # crop image for atari games + # the image from the environment is 210x160 + if self.tp.env.crop_observation and hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'): state['observation'] = state['observation'][34:195, :, :] return state @@ -170,7 +173,16 @@ class GymEnvironmentWrapper(EnvironmentWrapper): self.env.seed(self.seed) self.state = self._wrap_state(self.env.reset()) - while self.state is None: + + # initialize the number of lives + if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'): + self.current_ale_lives = self.env.env.ale.lives() + + # simulate a random initial environment state by stepping for a random number of times between 0 and 30 + step_count = 0 + random_initialization_steps = random.randint(0, self.random_initialization_steps) + while self.state is None or step_count < random_initialization_steps: + step_count += 1 self.step(0) return self.state diff --git a/logger.py b/logger.py index 08d026f..c070cc0 100644 --- a/logger.py +++ b/logger.py @@ -115,6 +115,14 @@ class ScreenLogger(object): if default is not None: return default + def change_terminal_title(self, title: str): 
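`GymEnvironmentWrapper.reset` now records the ALE life counter and then advances the environment with action 0 for a random number of steps, up to `random_initialization_steps` (30 for Atari), so that episodes do not all start from the same frame. A minimal sketch of the same no-op-start trick against the older gym API this codebase targets (the environment id is taken from the presets):

```python
import random
import gym

env = gym.make('PongDeterministic-v4')
max_random_init_steps = 30

state = env.reset()
# take between 0 and max_random_init_steps no-op actions before handing control to the agent
for _ in range(random.randint(0, max_random_init_steps)):
    state, reward, done, info = env.step(0)    # action 0 is a no-op in ALE
    if done:                                   # unlikely during a short no-op run, but keep the sketch safe
        state = env.reset()
```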
+ """ + Changes the title of the terminal window + :param title: The new title + :return: None + """ + print("\x1b]2;{}\x07".format(title)) + class BaseLogger(object): def __init__(self): @@ -157,6 +165,7 @@ class Logger(BaseLogger): self.time = None self.experiments_path = "" self.last_line_idx_written_to_csv = 0 + self.experiment_name = "" def set_current_time(self, time): self.time = time @@ -205,7 +214,9 @@ class Logger(BaseLogger): def signal_value_exists(self, time, signal_name): try: - self.get_signal_value(time, signal_name) + value = self.get_signal_value(time, signal_name) + if value != value: # value is nan + return False except: return False return True @@ -229,7 +240,8 @@ class Logger(BaseLogger): if self.start_time: self.create_signal_value('Wall-Clock Time', time.time() - self.start_time, time=episode) else: - self.create_signal_value('Wall-Clock Time', time.time(), time=episode) + self.create_signal_value('Wall-Clock Time', 0, time=episode) + self.start_time = time.time() def create_gif(self, images, fps=10, name="Gif"): output_file = '{}_{}.gif'.format(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'), name) @@ -243,7 +255,7 @@ class Logger(BaseLogger): def remove_experiment_dir(self): shutil.rmtree(self.experiments_path) - def print_summary(self): + def summarize_experiment(self): screen.separator() screen.log_title("Results stored at: {}".format(self.experiments_path)) screen.log_title("Total runtime: {}".format(datetime.datetime.now() - self.time_started)) @@ -273,7 +285,8 @@ class Logger(BaseLogger): screen.error('Experiment name must be composed only of alphanumeric letters, ' 'underscores and dashes and should not be longer than 100 characters.') - return match.group(0) + self.experiment_name = match.group(0) + return self.experiment_name def get_experiment_path(self, experiment_name, create_path=True): general_experiments_path = os.path.join('./experiments/', experiment_name) diff --git a/memories/differentiable_neural_dictionary.py b/memories/differentiable_neural_dictionary.py index 4904151..1a1fdc7 100644 --- a/memories/differentiable_neural_dictionary.py +++ b/memories/differentiable_neural_dictionary.py @@ -83,6 +83,11 @@ class AnnoyDictionary(object): # Returns the stored embeddings and values of the closest embeddings def query(self, keys, k): + if not self.has_enough_entries(k): + # this will only happen when the DND is not yet populated with enough entries, which is only during heatup + # these values won't be used and therefore they are meaningless + return [0.0], [0.0], [0] + _, indices = self._get_k_nearest_neighbors_indices(keys, k) embeddings = [] @@ -94,7 +99,7 @@ class AnnoyDictionary(object): self.current_timestamp += 1 - return embeddings, values + return embeddings, values, indices def has_enough_entries(self, k): return self.curr_size > k and (self.built_capacity > k) @@ -133,9 +138,11 @@ class AnnoyDictionary(object): class QDND: - def __init__(self, dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01): + def __init__(self, dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01, + learning_rate=0.01): self.num_actions = num_actions self.dicts = [] + self.learning_rate = learning_rate # create a dict for each action for a in range(num_actions): @@ -155,16 +162,18 @@ class QDND: self.dicts[a].add(curr_action_embeddings, curr_action_values) return True - def query(self, embeddings, actions, k): + def query(self, embeddings, action, k): # query for nearest neighbors 
to the given embeddings dnd_embeddings = [] dnd_values = [] - for i, action in enumerate(actions): - embedding, value = self.dicts[action].query([embeddings[i]], k) + dnd_indices = [] + for i in range(len(embeddings)): + embedding, value, indices = self.dicts[action].query([embeddings[i]], k) dnd_embeddings.append(embedding[0]) dnd_values.append(value[0]) + dnd_indices.append(indices[0]) - return dnd_embeddings, dnd_values + return dnd_embeddings, dnd_values, dnd_indices def has_enough_entries(self, k): # check if each of the action dictionaries has at least k entries @@ -193,4 +202,5 @@ def load_dnd(model_dir): DND.dicts[a].index.add_item(idx, key) DND.dicts[a].index.build(50) + return DND diff --git a/memories/episodic_experience_replay.py b/memories/episodic_experience_replay.py index 1d7e61f..5930d78 100644 --- a/memories/episodic_experience_replay.py +++ b/memories/episodic_experience_replay.py @@ -16,6 +16,7 @@ from memories.memory import * import threading +from typing import Union class EpisodicExperienceReplay(Memory): @@ -103,7 +104,8 @@ class EpisodicExperienceReplay(Memory): if transition.game_over: self._num_transitions_in_complete_episodes += last_episode.length() self._length += 1 - self.buffer[-1].update_returns(self.discount, is_bootstrapped=self.return_is_bootstrapped, + self.buffer[-1].update_returns(self.discount, + is_bootstrapped=self.tp.agent.bootstrap_total_return_from_old_policy, n_step_return=self.tp.agent.n_step) self.buffer[-1].update_measurements_targets(self.tp.agent.num_predicted_steps_ahead) # self.buffer[-1].update_actions_probabilities() # used for off-policy policy optimization @@ -146,6 +148,17 @@ class EpisodicExperienceReplay(Memory): def get(self, index): return self.get_episode(index) + def get_last_complete_episode(self) -> Union[None, Episode]: + """ + Returns the last complete episode in the memory or None if there are no complete episodes + :return: None or the last complete episode + """ + last_complete_episode_index = self.num_complete_episodes()-1 + if last_complete_episode_index >= 0: + return self.get(last_complete_episode_index) + else: + return None + def update_last_transition_info(self, info): episode = self.buffer[-1] if episode.length() == 0: diff --git a/memories/memory.py b/memories/memory.py index 4c479e3..f4c6a87 100644 --- a/memories/memory.py +++ b/memories/memory.py @@ -80,9 +80,12 @@ class Episode(object): total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0) current_discount *= discount + # calculate the bootstrapped returns + bootstraps = np.array([np.squeeze(t.info['max_action_value']) for t in self.transitions[n_step_return:]]) + bootstrapped_return = total_return + current_discount * np.pad(bootstraps, (0, n_step_return), 'constant', + constant_values=0) if is_bootstrapped: - bootstraps = np.array([np.squeeze(t.info['action_value']) for t in self.transitions[n_step_return:]]) - total_return += current_discount * np.pad(bootstraps, (0, n_step_return), 'constant', constant_values=0) + total_return = bootstrapped_return for transition_idx in range(self.length()): self.transitions[transition_idx].total_return = total_return[transition_idx] @@ -114,7 +117,13 @@ class Episode(object): return self.returns_table def get_returns(self): - return [t.total_return for t in self.transitions] + return self.get_transitions_attribute('total_return') + + def get_transitions_attribute(self, attribute_name): + if hasattr(self.transitions[0], attribute_name): + return [t.__dict__[attribute_name] for t in 
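`Episode.update_returns` above now always computes the bootstrapped N-step return and uses the stored `max_action_value` (rather than the value of the action actually taken) for the tail: R_t = sum_{i=0}^{N-1} gamma^i * r_{t+i} + gamma^N * max_a Q(s_{t+N}, a). A direct numpy sketch using an explicit loop instead of the padded vectorized form in the diff:

```python
import numpy as np

def n_step_bootstrapped_returns(rewards, max_q_values, discount=0.99, n_step=100):
    """rewards[t] and max_q_values[t] = max_a Q(s_t, a) for one episode of length T."""
    T = len(rewards)
    returns = np.zeros(T)
    for t in range(T):
        for i, r in enumerate(rewards[t:t + n_step]):
            returns[t] += (discount ** i) * r
        if t + n_step < T:
            # bootstrap the truncated tail from the greedy value estimate at s_{t+N}
            returns[t] += (discount ** n_step) * max_q_values[t + n_step]
    return returns

rewards = np.random.randn(300)
max_q_values = np.random.randn(300)
print(n_step_bootstrapped_returns(rewards, max_q_values)[:3])
```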
self.transitions] + else: + raise ValueError("The transitions have no such attribute name") def to_batch(self): batch = [] @@ -141,14 +150,12 @@ class Transition(object): :param game_over: A boolean which should be True if the episode terminated after the execution of the action. """ - self.state = copy.deepcopy(state) - self.state['observation'] = np.array(self.state['observation'], copy=False) + self.state = state self.action = action self.reward = reward self.total_return = None if not next_state: next_state = state - self.next_state = copy.deepcopy(next_state) - self.next_state['observation'] = np.array(self.next_state['observation'], copy=False) + self.next_state = next_state self.game_over = game_over self.info = {} diff --git a/plot_atari.py b/plot_atari.py new file mode 100644 index 0000000..8732fb0 --- /dev/null +++ b/plot_atari.py @@ -0,0 +1,105 @@ +import argparse +import matplotlib +import matplotlib.pyplot as plt +from dashboard import SignalsFile +import os + + +class FigureMaker(object): + def __init__(self, path, cols, smoothness, signal_to_plot, x_axis): + self.experiments_path = path + self.environments = self.list_environments() + self.cols = cols + self.rows = int((len(self.environments) + cols - 1) / cols) + self.smoothness = smoothness + self.signal_to_plot = signal_to_plot + self.x_axis = x_axis + + params = { + 'axes.labelsize': 8, + 'font.size': 10, + 'legend.fontsize': 14, + 'xtick.labelsize': 8, + 'ytick.labelsize': 8, + 'text.usetex': False, + 'figure.figsize': [16, 30] + } + matplotlib.rcParams.update(params) + + def list_environments(self): + environments = sorted([e.name for e in os.scandir(args.path) if e.is_dir()]) + filtered_environments = self.filter_environments(environments) + return filtered_environments + + def filter_environments(self, environments): + filtered_environments = [] + for idx, environment in enumerate(environments): + path = os.path.join(args.path, environment) + experiments = [e.name for e in os.scandir(path) if e.is_dir()] + + # take only the last updated experiment directory + last_experiment_dir = max([os.path.join(path, root) for root in experiments], key=os.path.getctime) + + # make sure there is a csv file inside it + for file_path in os.listdir(last_experiment_dir): + full_file_path = os.path.join(last_experiment_dir, file_path) + if os.path.isfile(full_file_path) and file_path.endswith('.csv'): + filtered_environments.append((environment, full_file_path)) + + return filtered_environments + + def plot_figures(self): + for idx, (environment, full_file_path) in enumerate(self.environments): + print(environment) + axis = plt.subplot(self.rows, self.cols, idx + 1) + signals = SignalsFile(full_file_path) + signals.change_averaging_window(self.smoothness, force=True, signals=[self.signal_to_plot]) + steps = signals.bokeh_source.data[self.x_axis] + rewards = signals.bokeh_source.data[self.signal_to_plot] + + yloc = plt.MaxNLocator(4) + axis.yaxis.set_major_locator(yloc) + axis.ticklabel_format(style='sci', axis='x', scilimits=(0, 0)) + plt.title(environment, fontsize=10, y=1.08) + plt.plot(steps, rewards, linewidth=0.8) + plt.subplots_adjust(hspace=2.0, wspace=0.4) + + def save_pdf(self, name): + plt.savefig(name + ".pdf", bbox_inches='tight') + + def show_figures(self): + plt.show() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--path', + help="(string) Root directory of the experiments", + default=None, + type=str) + parser.add_argument('-c', '--cols', + help="(int) Number of plot 
columns", + default=6, + type=int) + parser.add_argument('-s', '--smoothness', + help="(int) Number of consequent episodes to average over", + default=200, + type=int) + parser.add_argument('-sig', '--signal', + help="(str) The name of the signal to plot", + default='Evaluation Reward', + type=str) + parser.add_argument('-x', '--x_axis', + help="(str) The meaning of the x axis", + default='Total steps', + type=str) + parser.add_argument('-pdf', '--pdf', + help="(str) A name of a pdf to save to", + default='atari', + type=str) + args = parser.parse_args() + + maker = FigureMaker(args.path, cols=args.cols, smoothness=args.smoothness, signal_to_plot=args.signal, x_axis=args.x_axis) + maker.plot_figures() + maker.save_pdf(args.pdf) + maker.show_figures() diff --git a/presets.py b/presets.py index c0225ad..dcfb765 100644 --- a/presets.py +++ b/presets.py @@ -89,7 +89,6 @@ class Doom_Basic_QRDQN(Preset): self.num_heatup_steps = 1000 - class Doom_Basic_OneStepQ(Preset): def __init__(self): Preset.__init__(self, NStepQ, Doom, ExplorationParameters) @@ -408,8 +407,67 @@ class Breakout_DQN(Preset): self.exploration.evaluation_policy = 'EGreedy' self.exploration.evaluation_epsilon = 0.05 self.num_heatup_steps = 50000 + self.agent.num_consecutive_playing_steps = 4 self.evaluation_episodes = 1 - self.evaluate_every_x_episodes = 100 + self.evaluate_every_x_episodes = 25 + self.agent.replace_mse_with_huber_loss = True + # self.env.crop_observation = True # TODO: remove + # self.rescaling_interpolation_type = 'nearest' # TODO: remove + + +class Breakout_DDQN(Preset): + def __init__(self): + Preset.__init__(self, DDQN, Atari, ExplorationParameters) + self.env.level = 'BreakoutDeterministic-v4' + self.agent.num_steps_between_copying_online_weights_to_target = 30000 + self.learning_rate = 0.00025 + self.agent.num_transitions_in_experience_replay = 1000000 + self.exploration.initial_epsilon = 1.0 + self.exploration.final_epsilon = 0.01 + self.exploration.epsilon_decay_steps = 1000000 + self.exploration.evaluation_policy = 'EGreedy' + self.exploration.evaluation_epsilon = 0.001 + self.num_heatup_steps = 50000 + self.agent.num_consecutive_playing_steps = 4 + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 25 + self.agent.replace_mse_with_huber_loss = True + + +class Breakout_Dueling_DDQN(Preset): + def __init__(self): + Preset.__init__(self, DDQN, Atari, ExplorationParameters) + self.env.level = 'BreakoutDeterministic-v4' + self.agent.output_types = [OutputTypes.DuelingQ] + self.agent.num_steps_between_copying_online_weights_to_target = 30000 + self.learning_rate = 0.00025 + self.agent.num_transitions_in_experience_replay = 1000000 + self.exploration.initial_epsilon = 1.0 + self.exploration.final_epsilon = 0.01 + self.exploration.epsilon_decay_steps = 1000000 + self.exploration.evaluation_policy = 'EGreedy' + self.exploration.evaluation_epsilon = 0.001 + self.num_heatup_steps = 50000 + self.agent.num_consecutive_playing_steps = 4 + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 25 + self.agent.replace_mse_with_huber_loss = True + +class Alien_DQN(Preset): + def __init__(self): + Preset.__init__(self, DQN, Atari, ExplorationParameters) + self.env.level = 'AlienDeterministic-v4' + self.agent.num_steps_between_copying_online_weights_to_target = 10000 + self.learning_rate = 0.00025 + self.agent.num_transitions_in_experience_replay = 1000000 + self.exploration.initial_epsilon = 1.0 + self.exploration.final_epsilon = 0.1 + self.exploration.epsilon_decay_steps = 1000000 + 
self.exploration.evaluation_policy = 'EGreedy' + self.exploration.evaluation_epsilon = 0.05 + self.num_heatup_steps = 50000 + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 5 class Breakout_C51(Preset): @@ -846,7 +904,8 @@ class CartPole_NEC(Preset): self.num_heatup_steps = 1000 self.exploration.epsilon_decay_steps = 1000 self.exploration.final_epsilon = 0.1 - self.agent.discount = 1.0 + self.agent.discount = 0.99 + self.seed = 0 self.test = True self.test_max_step_threshold = 200 @@ -857,10 +916,16 @@ class Doom_Basic_NEC(Preset): def __init__(self): Preset.__init__(self, NEC, Doom, ExplorationParameters) self.env.level = 'basic' - self.agent.num_episodes_in_experience_replay = 200 - self.learning_rate = 0.00025 - self.num_heatup_steps = 1000 - self.agent.num_playing_steps_between_two_training_steps = 1 + self.learning_rate = 0.00001 + self.agent.num_transitions_in_experience_replay = 100000 + # self.exploration.initial_epsilon = 0.1 # TODO: try exploration + # self.exploration.final_epsilon = 0.1 + # self.exploration.epsilon_decay_steps = 1000000 + self.num_heatup_steps = 200 + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 5 + self.seed = 123 + class Montezuma_NEC(Preset): @@ -877,12 +942,20 @@ class Breakout_NEC(Preset): def __init__(self): Preset.__init__(self, NEC, Atari, ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' - self.learning_rate = 0.00025 + self.agent.num_steps_between_copying_online_weights_to_target = 10000 + self.learning_rate = 0.00001 self.agent.num_transitions_in_experience_replay = 1000000 - self.exploration.initial_epsilon = 1.0 + self.exploration.initial_epsilon = 0.1 self.exploration.final_epsilon = 0.1 self.exploration.epsilon_decay_steps = 1000000 - self.num_heatup_steps = 50000 + self.exploration.evaluation_policy = 'EGreedy' + self.exploration.evaluation_epsilon = 0.05 + self.num_heatup_steps = 1000 + self.env.reward_clipping_max = None + self.env.reward_clipping_min = None + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 25 + self.seed = 123 class Doom_Health_NEC(Preset): @@ -924,12 +997,54 @@ class Pong_NEC(Preset): def __init__(self): Preset.__init__(self, NEC, Atari, ExplorationParameters) self.env.level = 'PongDeterministic-v4' - self.learning_rate = 0.001 + self.learning_rate = 0.00001 self.agent.num_transitions_in_experience_replay = 100000 - self.exploration.initial_epsilon = 0.5 + self.exploration.initial_epsilon = 0.1 # TODO: try exploration self.exploration.final_epsilon = 0.1 self.exploration.epsilon_decay_steps = 1000000 + self.num_heatup_steps = 2000 + self.env.reward_clipping_max = None + self.env.reward_clipping_min = None + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 5 + self.env.crop_observation = True # TODO: remove + self.env.random_initialization_steps = 1 # TODO: remove + # self.seed = 123 + + +class Alien_NEC(Preset): + def __init__(self): + Preset.__init__(self, NEC, Atari, ExplorationParameters) + self.env.level = 'AlienDeterministic-v4' + self.learning_rate = 0.0001 + self.agent.num_transitions_in_experience_replay = 100000 + self.exploration.initial_epsilon = 0.1 # TODO: try exploration + self.exploration.final_epsilon = 0.1 + self.exploration.epsilon_decay_steps = 1000000 + self.num_heatup_steps = 3000 + self.env.reward_clipping_max = None + self.env.reward_clipping_min = None + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 5 + self.seed = 123 + + +class Pong_DQN(Preset): + def __init__(self): + Preset.__init__(self, DQN, 
Atari, ExplorationParameters) + self.env.level = 'PongDeterministic-v4' + self.agent.num_steps_between_copying_online_weights_to_target = 10000 + self.learning_rate = 0.00025 + self.agent.num_transitions_in_experience_replay = 1000000 + self.exploration.initial_epsilon = 1.0 + self.exploration.final_epsilon = 0.1 + self.exploration.epsilon_decay_steps = 1000000 + self.exploration.evaluation_policy = 'EGreedy' + self.exploration.evaluation_epsilon = 0.05 self.num_heatup_steps = 50000 + self.evaluation_episodes = 1 + self.evaluate_every_x_episodes = 5 + self.seed = 123 class CartPole_A3C(Preset): diff --git a/run_test.py b/run_test.py index 7e8f6ff..196056e 100644 --- a/run_test.py +++ b/run_test.py @@ -50,6 +50,9 @@ if __name__ == '__main__': parser.add_argument('-v', '--verbose', help="(flag) display verbose logs in the event of an error", action='store_true') + parser.add_argument('-l', '--list_presets', + help="(flag) list all the presets that are tested", + action='store_true') parser.add_argument('--stop_after_first_failure', help="(flag) stop executing tests after the first error", action='store_true') @@ -73,6 +76,14 @@ if __name__ == '__main__': presets_to_ignore = args.ignore_presets.split(',') else: presets_to_ignore = [] + + if args.list_presets: + for idx, preset_name in enumerate(presets_lists): + preset = eval('presets.{}()'.format(preset_name)) + if preset.test and preset_name not in presets_to_ignore: + print(preset_name) + exit(0) + for idx, preset_name in enumerate(presets_lists): preset = eval('presets.{}()'.format(preset_name)) if preset.test and preset_name not in presets_to_ignore: diff --git a/utils.py b/utils.py index 4eecbce..7f75ac5 100644 --- a/utils.py +++ b/utils.py @@ -21,6 +21,7 @@ import numpy as np import threading from subprocess import call, Popen import signal +import copy killed_processes = [] @@ -333,6 +334,23 @@ def switch_axes_order(observation, from_type='channels_first', to_type='channels return np.transpose(observation, (1, 0)) +class LazyStack(object): + """ + A lazy version of np.stack which avoids copying the memory until it is + needed. + """ + + def __init__(self, history, axis=None): + self.history = copy.copy(history) + self.axis = axis + + def __array__(self, dtype=None): + array = np.stack(self.history, axis=self.axis) + if dtype is not None: + array = array.astype(dtype) + return array + + def stack_observation(curr_stack, observation, stack_size): """ Adds a new observation to an existing stack of observations from previous time-steps.
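The `LazyStack` added to `utils.py` above implements the numpy `__array__` protocol, so stacking is deferred until something converts the object to an array; transitions in the replay buffer therefore hold cheap views of the shared frame deque, and `extract_batch` only pays for the copy when it wraps each stored observation in `np.array(...)`. A small check of that behaviour (the frame shape is an assumption):

```python
import numpy as np
from collections import deque
from utils import LazyStack   # the class added in this diff

frames = deque([np.full((84, 84), i, dtype=np.uint8) for i in range(4)], maxlen=4)
lazy = LazyStack(frames, -1)

frames.append(np.full((84, 84), 9, dtype=np.uint8))   # does not change the existing view
stacked = np.array(lazy)                              # __array__ runs here, not at construction
print(stacked.shape)                                  # (84, 84, 4)
print(np.unique(stacked[..., -1]))                    # [3]: the view kept its own snapshot of the deque
```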