mirror of https://github.com/gryf/coach.git synced 2025-12-17 11:10:20 +01:00

Multiple improvements and bug fixes (#66)

* Multiple improvements and bug fixes:

    * Use lazy stacking to save memory when using a replay buffer (see the sketch after this list)
    * Remove step counting for evaluation episodes
    * Reset the game between heatup and training
    * Major bug fixes in NEC (it now reproduces the paper's results on Pong)
    * Image input rescaling to the range 0-1 is now optional
    * Change the terminal title to the experiment name
    * Observation cropping for Atari is now optional
    * Added a random number of no-op actions at the start of each Gym episode, to match the DQN paper
    * Fixed a bug where evaluation episodes would not start with the maximum possible number of ALE lives
    * Added a script for plotting the results of an experiment across all the Atari games
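
The lazy-stacking change works roughly like this: the replay buffer keeps references to the individual frames, and the stacked array is only materialized when something (e.g. extract_batch) converts the state to a numpy array, so consecutive transitions can share frames instead of each holding its own stacked copy. A minimal sketch, mirroring the LazyStack helper added to utils.py in this commit (the surrounding usage is illustrative, not the exact agent code):

```python
from collections import deque
import numpy as np


class LazyStack(object):
    """A lazy version of np.stack which avoids copying the memory until it is needed."""
    def __init__(self, history, axis=None):
        self.history = list(history)  # keep references to the frames, no copy yet
        self.axis = axis

    def __array__(self, dtype=None):
        # calling np.array() on this object triggers the actual stacking
        array = np.stack(self.history, axis=self.axis)
        return array.astype(dtype) if dtype is not None else array


# illustrative usage: a 4-frame stack of 84x84 observations
frames = deque(maxlen=4)
for _ in range(4):
    frames.append(np.zeros((84, 84), dtype=np.uint8))

state = {'observation': LazyStack(frames, axis=-1)}   # stored in the transition as-is
batch = np.array([np.array(state['observation'])])    # stacking happens only here
print(batch.shape)  # (1, 84, 84, 4)
```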
Authored by Itai Caspi, 2018-02-26 12:29:07 +02:00; committed by GitHub
parent 4fe9cba445
commit a7206ed702
20 changed files with 465 additions and 158 deletions

.gitignore (5 changes)

@@ -15,3 +15,8 @@ roboschool
*.orig
docs/site
coach_env
build
rl_coach.egg*
contrib
test_log_*
dist

View File

@@ -24,6 +24,8 @@ except:
import copy
from renderer import Renderer
from configurations import Preset
from collections import deque
from utils import LazyStack
from collections import OrderedDict
from utils import RunPhase, Signal, is_empty, RunningStat
from architectures import *
@@ -214,6 +216,8 @@ class Agent(object):
network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init
self.prepare_initial_state()
def preprocess_observation(self, observation):
    """
    Preprocesses the given observation.
@@ -291,9 +295,8 @@ class Agent(object):
""" """
current_states = {} current_states = {}
next_states = {} next_states = {}
current_states['observation'] = np.array([np.array(transition.state['observation']) for transition in batch])
current_states['observation'] = np.array([transition.state['observation'] for transition in batch]) next_states['observation'] = np.array([np.array(transition.next_state['observation']) for transition in batch])
next_states['observation'] = np.array([transition.next_state['observation'] for transition in batch])
actions = np.array([transition.action for transition in batch]) actions = np.array([transition.action for transition in batch])
rewards = np.array([transition.reward for transition in batch]) rewards = np.array([transition.reward for transition in batch])
game_overs = np.array([transition.game_over for transition in batch]) game_overs = np.array([transition.game_over for transition in batch])
@@ -349,6 +352,23 @@ class Agent(object):
input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0)
return input_state
def prepare_initial_state(self):
"""
Create an initial state when starting a new episode
:return: None
"""
observation = self.preprocess_observation(self.env.state['observation'])
self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
observation = LazyStack(self.curr_stack, -1)
self.curr_state = {
'observation': observation
}
if self.tp.agent.use_measurements:
self.curr_state['measurements'] = self.env.measurements
if self.tp.agent.use_accumulated_reward_as_measurement:
self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
def act(self, phase=RunPhase.TRAIN):
    """
    Take one step in the environment according to the network prediction and store the transition in memory
@@ -356,34 +376,12 @@ class Agent(object):
    :return: A boolean value that signals an episode termination
    """
    if phase != RunPhase.TEST:
        self.total_steps_counter += 1
        self.current_episode_steps_counter += 1

    # get new action
-   action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0}
    action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0}

-   is_first_transition_in_episode = (self.curr_state == {})
-   if is_first_transition_in_episode:
-       if not isinstance(self.env.state, dict):
-           raise ValueError((
-               'expected state to be a dictionary, found {}'
-           ).format(type(self.env.state)))
-       state = self.env.state
-       # TODO: modify preprocess_observation to modify the entire state
-       # for now, only preprocess the observation
-       state['observation'] = self.preprocess_observation(state['observation'])
-       # TODO: provide option to stack more than just the observation
-       # TODO: this should probably be happening in an environment wrapper anyway
-       state['observation'] = stack_observation([], state['observation'], self.tp.env.observation_stack_size)
-       self.curr_state = state
-       if self.tp.agent.use_measurements:
-           # TODO: this should be handled in the environment
-           self.curr_state['measurements'] = self.env.measurements
-           if self.tp.agent.use_accumulated_reward_as_measurement:
-               self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)

    if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
        action = self.env.get_random_action()
@@ -409,8 +407,10 @@ class Agent(object):
# initialize the next state
# TODO: provide option to stack more than just the observation
-next_state['observation'] = stack_observation(self.curr_state['observation'], next_state['observation'], self.tp.env.observation_stack_size)
self.curr_stack.append(next_state['observation'])
observation = LazyStack(self.curr_stack, -1)
next_state['observation'] = observation
if self.tp.agent.use_measurements and 'measurements' in result.keys():
    next_state['measurements'] = result['state']['measurements']
    if self.tp.agent.use_accumulated_reward_as_measurement:
@@ -516,6 +516,7 @@ class Agent(object):
self.exploration_policy.change_phase(RunPhase.TRAIN)
training_start_time = time.time()
model_snapshots_periods_passed = -1
self.reset_game()
while self.training_iteration < self.tp.num_training_iterations:
    # evaluate
@@ -526,7 +527,7 @@ class Agent(object):
                  self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0)
if evaluate_agent:
-   self.env.reset()
    self.env.reset(force_environment_reset=True)
    self.last_episode_evaluation_ran = self.current_episode
    self.evaluate(self.tp.evaluation_episodes)

View File

@@ -27,10 +27,7 @@ class NECAgent(ValueOptimizationAgent):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                create_target_network=False)
self.current_episode_state_embeddings = []
-self.current_episode_actions = []
self.training_started = False
-# if self.tp.checkpoint_restore_dir:
-#     self.load_dnd(self.tp.checkpoint_restore_dir)

def learn_from_batch(self, batch):
    if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
@@ -41,83 +38,57 @@ class NECAgent(ValueOptimizationAgent):
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")

current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

-result = self.main_network.train_and_sync_networks(current_states, total_return)
TD_targets = self.main_network.online_network.predict(current_states)

# only update the action that we have actually done in this transition
for i in range(self.tp.batch_size):
    TD_targets[i, actions[i]] = total_return[i]

# train the neural network
result = self.main_network.train_and_sync_networks(current_states, TD_targets)

total_loss = result[0]
return total_loss
-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
-    """
-    this method modifies the superclass's behavior in only 3 ways:
-    1) the embedding is saved and stored in self.current_episode_state_embeddings
-    2) the dnd output head is only called if it has a minimum number of entries in it
-       ideally, the dnd head would do this on its own, but in my attempt in encoding this
-       behavior in tensorflow, I ran into problems. Would definitely be worth
-       revisiting in the future
-    3) during training, actions are saved and stored in self.current_episode_actions
-    if behaviors 1 and 2 were handled elsewhere, this could easily be implemented
-    as a wrapper around super instead of overriding this method entirely
-    """
-    # get embedding
-    embedding = self.main_network.online_network.predict(
-        self.tf_input_state(curr_state),
-        outputs=self.main_network.online_network.state_embedding)
-    self.current_episode_state_embeddings.append(embedding)
-    # TODO: support additional heads. Right now all other heads are ignored
-    if self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
-        # if there are enough entries in the DND then we can query it to get the action values
-        # actions_q_values = []
-        feed_dict = {
-            self.main_network.online_network.state_embedding: [embedding],
-        }
-        actions_q_values = self.main_network.sess.run(
-            self.main_network.online_network.output_heads[0].output, feed_dict=feed_dict)
-    else:
-        # get only the embedding so we can insert it to the DND
-        actions_q_values = [0] * self.action_space_size
-    # choose action according to the exploration policy and the current phase (evaluating or training the agent)
-    if phase == RunPhase.TRAIN:
-        action = self.exploration_policy.get_action(actions_q_values)
-        # NOTE: this next line is not in the parent implementation
-        # NOTE: it could be implemented as a wrapper around the parent since action is returned
-        self.current_episode_actions.append(action)
-    else:
-        action = np.argmax(actions_q_values)
-    # store the q values statistics for logging
-    self.q_values.add_sample(actions_q_values)
-    # store information for plotting interactively (actual plotting is done in agent)
-    if self.tp.visualization.plot_action_values_online:
-        for idx, action_name in enumerate(self.env.actions_description):
-            self.episode_running_info[action_name].append(actions_q_values[idx])
-    action_value = {"action_value": actions_q_values[action]}
-    return action, action_value

def act(self, phase=RunPhase.TRAIN):
    if self.in_heatup:
        # get embedding in heatup (otherwise we get it through choose_action)
        embedding = self.main_network.online_network.predict(
            self.tf_input_state(self.curr_state),
            outputs=self.main_network.online_network.state_embedding)
        self.current_episode_state_embeddings.append(embedding)
    return super().act(phase)

def get_prediction(self, curr_state):
    # get the actions q values and the state embedding
    embedding, actions_q_values = self.main_network.online_network.predict(
        self.tf_input_state(curr_state),
        outputs=[self.main_network.online_network.state_embedding,
                 self.main_network.online_network.output_heads[0].output]
    )
    # store the state embedding for inserting it to the DND later
    self.current_episode_state_embeddings.append(embedding.squeeze())
    actions_q_values = actions_q_values[0][0]
    return actions_q_values

def reset_game(self, do_not_reset_env=False):
-   ValueOptimizationAgent.reset_game(self, do_not_reset_env)
-   # make sure we already have at least one episode
-   if self.memory.num_complete_episodes() >= 1 and not self.in_heatup:
-       # get the last full episode that we have collected
-       episode = self.memory.get(-2)
-       returns = []
-       for i in range(episode.length()):
-           returns.append(episode.get_transition(i).total_return)
-       # Just to deal with the end of heatup where there might be a case where it ends in a middle
-       # of an episode, and thus when getting the episode out of the ER, it will be a complete one whereas
-       # the other statistics collected here, are collected only during training.
-       returns = returns[-len(self.current_episode_actions):]
-       self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
-                                                                 self.current_episode_actions, returns)
-   self.current_episode_actions = []
    super().reset_game(do_not_reset_env)
    # get the last full episode that we have collected
    episode = self.memory.get_last_complete_episode()
    if episode is not None:
        # the indexing is only necessary because the heatup can end in the middle of an episode
        # this won't be required after fixing this so that when the heatup is ended, the episode is closed
        returns = episode.get_transitions_attribute('total_return')[:len(self.current_episode_state_embeddings)]
        actions = episode.get_transitions_attribute('action')[:len(self.current_episode_state_embeddings)]
        self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
                                                                  actions, returns)
    self.current_episode_state_embeddings = []

def save_model(self, model_id):
    self.main_network.save_model(model_id)

View File

@@ -73,5 +73,5 @@ class ValueOptimizationAgent(Agent):
for idx, action_name in enumerate(self.env.actions_description):
    self.episode_running_info[action_name].append(actions_q_values[idx])
-action_value = {"action_value": actions_q_values[action]}
action_value = {"action_value": actions_q_values[action], "max_action_value": np.max(actions_q_values)}
return action, action_value

View File

@@ -125,14 +125,15 @@ class NetworkWrapper(object):
""" """
self.online_network.apply_gradients(self.online_network.accumulated_gradients) self.online_network.apply_gradients(self.online_network.accumulated_gradients)
def train_and_sync_networks(self, inputs, targets): def train_and_sync_networks(self, inputs, targets, additional_fetches=[]):
""" """
A generic training function that enables multi-threading training using a global network if necessary. A generic training function that enables multi-threading training using a global network if necessary.
:param inputs: The inputs for the network. :param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs :param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
:return: The loss of the training iteration :return: The loss of the training iteration
""" """
result = self.online_network.accumulate_gradients(inputs, targets) result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches)
self.apply_gradients_and_sync_networks() self.apply_gradients_and_sync_networks()
return result return result

View File

@@ -56,7 +56,8 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
# the observation can be either an image or a vector
def get_observation_embedding(with_timestep=False):
    if self.input_height > 1:
-       return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation")
        return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation",
                             input_rescaler=self.tp.agent.input_rescaler)
    else:
        return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation")
@@ -191,7 +192,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
if tuning_parameters.agent.optimizer_type == 'Adam':
    self.optimizer = tf.train.AdamOptimizer(learning_rate=tuning_parameters.learning_rate)
elif tuning_parameters.agent.optimizer_type == 'RMSProp':
-   self.optimizer = tf.train.RMSPropOptimizer(self.tp.learning_rate, decay=0.9, epsilon=0.01)
    self.optimizer = tf.train.RMSPropOptimizer(tuning_parameters.learning_rate, decay=0.9, epsilon=0.01)
elif tuning_parameters.agent.optimizer_type == 'LBFGS':
    self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
                                                            options={'maxiter': 25})

View File

@@ -58,6 +58,7 @@ class Head(object):
self.regularizations = force_list(self.regularizations)
if self.is_local:
    self.set_loss()
    self._post_build()
if self.is_local:
    return self.output, self.target, self.input
@@ -76,6 +77,14 @@ class Head(object):
""" """
pass pass
def _post_build(self):
"""
Optional function that allows adding any extra definitions after the head has been fully defined
For example, this allows doing additional calculations that are based on the loss
:return: None
"""
pass
def get_name(self):
    """
    Get a formatted name for the module
@@ -271,6 +280,9 @@ class DNDQHead(Head):
else:
    self.loss_type = tf.losses.mean_squared_error
self.tp = tuning_parameters
self.dnd_embeddings = [None]*self.num_actions
self.dnd_values = [None]*self.num_actions
self.dnd_indices = [None]*self.num_actions
def _build_module(self, input_layer):
    # DND based Q head
@@ -281,29 +293,29 @@ class DNDQHead(Head):
else:
    self.DND = differentiable_neural_dictionary.QDND(
        self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
-       key_error_threshold=self.DND_key_error_threshold)
        key_error_threshold=self.DND_key_error_threshold, learning_rate=self.tp.learning_rate)

# Retrieve info from DND dictionary
-# self.action = tf.placeholder(tf.int8, [None], name="action")
-# self.input = self.action
-self.output = [
-    self._q_value(input_layer, action)
-    for action in range(self.num_actions)
-]
# We assume that all actions have enough entries in the DND
self.output = tf.transpose([
    self._q_value(input_layer, action)
    for action in range(self.num_actions)
])

def _q_value(self, input_layer, action):
    result = tf.py_func(self.DND.query,
-                       [input_layer, [action], self.number_of_nn],
-                       [tf.float64, tf.float64])
                        [input_layer, action, self.number_of_nn],
                        [tf.float64, tf.float64, tf.int64])
-   dnd_embeddings = tf.to_float(result[0])
-   dnd_values = tf.to_float(result[1])
    self.dnd_embeddings[action] = tf.to_float(result[0])
    self.dnd_values[action] = tf.to_float(result[1])
    self.dnd_indices[action] = result[2]

    # DND calculation
-   square_diff = tf.square(dnd_embeddings - tf.expand_dims(input_layer, 1))
    square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
    distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
    weights = 1.0 / distances
    normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
-   return tf.reduce_sum(dnd_values * normalised_weights, axis=1)
    return tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
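
For reference, the DND lookup above estimates Q(s, a) as an inverse-squared-distance weighted average of the values stored for the k nearest keys. A plain-numpy sketch of that calculation (array shapes and the random data are assumptions for illustration; the delta matches the l2_norm_added_delta = 0.001 default in configurations.py):

```python
import numpy as np

def dnd_q_value(query_embedding, dnd_keys, dnd_values, delta=0.001):
    """query_embedding: (d,), dnd_keys: (k, d), dnd_values: (k,) -> scalar Q estimate."""
    # squared L2 distance to each retrieved key, plus a small delta to avoid division by zero
    distances = np.sum((dnd_keys - query_embedding) ** 2, axis=1) + delta
    weights = 1.0 / distances
    normalised_weights = weights / np.sum(weights)
    # kernel-weighted average of the returns stored for the k nearest neighbors
    return float(np.sum(dnd_values * normalised_weights))

# illustrative numbers: 50 neighbors (number_of_knn) with 512-dimensional embeddings
keys, values = np.random.randn(50, 512), np.random.randn(50)
q = dnd_q_value(np.random.randn(512), keys, values)
```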
class NAFHead(Head):

View File

@@ -116,6 +116,14 @@ python3 coach.py -p Doom_Health_MMC -r
## NEC
## Pong_NEC
```bash
python3 coach.py -p Pong_NEC -r
```
<img src="img/Pong_NEC.png" alt="Pong_NEC" width="400"/>
## Doom_Basic_NEC
```bash

benchmarks/img/Pong_NEC.png (new binary file, 21 KiB; not shown)

View File

@@ -34,11 +34,6 @@ import sys
import subprocess
from threading import Thread

-try:
-    from Queue import Queue, Empty
-except ImportError:
-    from queue import Queue, Empty  # for Python 3.x

if len(set(failed_imports)) > 0:
    screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
@@ -258,7 +253,8 @@ if __name__ == "__main__":
# dump documentation
logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True)
if not args.no_summary:
-   atexit.register(logger.print_summary)
    atexit.register(logger.summarize_experiment)
screen.change_terminal_title(logger.experiment_name)

# Single-threaded runs
if run_dict['num_threads'] == 1:
@@ -300,7 +296,7 @@ if __name__ == "__main__":
"--worker_hosts={}".format(worker_hosts), "--worker_hosts={}".format(worker_hosts),
"--job_name=ps", "--job_name=ps",
] ]
parameter_server = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) parameter_server = Popen(cmd)
screen.log_title("*** Distributed Training ***") screen.log_title("*** Distributed Training ***")
time.sleep(1) time.sleep(1)
@@ -325,7 +321,7 @@ if __name__ == "__main__":
"--job_name=worker", "--job_name=worker",
"--load_json={}".format(json_run_dict_path)] "--load_json={}".format(json_run_dict_path)]
p = Popen(workers_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1) p = Popen(workers_args)
if i != run_dict['num_threads']: if i != run_dict['num_threads']:
workers.append(p) workers.append(p)

View File

@@ -115,6 +115,7 @@ class AgentParameters(Parameters):
replace_mse_with_huber_loss = False
load_memory_from_file_path = None
collect_new_data = True
input_rescaler = 255.0

# PPO related params
target_kl_divergence = 0.01
@@ -154,6 +155,8 @@ class EnvironmentParameters(Parameters):
desired_observation_width = 76
desired_observation_height = 60
normalize_observation = False
crop_observation = False
random_initialization_steps = 0
reward_scaling = 1.0
reward_clipping_min = None
reward_clipping_max = None
@@ -290,6 +293,8 @@ class Atari(EnvironmentParameters):
desired_observation_width = 84
reward_clipping_max = 1.0
reward_clipping_min = -1.0
random_initialization_steps = 30
crop_observation = False  # in the original paper the observation is cropped but not in the Nature paper

class Doom(EnvironmentParameters):
@@ -355,6 +360,7 @@ class DQN(AgentParameters):
class DDQN(DQN):
    type = 'DDQNAgent'
    num_steps_between_copying_online_weights_to_target = 30000

class DuelingDQN(DQN):
@@ -384,17 +390,19 @@ class QuantileRegressionDQN(DQN):
class NEC(AgentParameters):
    type = 'NECAgent'
-   optimizer_type = 'RMSProp'
    optimizer_type = 'Adam'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.DNDQ]
    loss_weights = [1.0]
    dnd_size = 500000
    l2_norm_added_delta = 0.001
-   new_value_shift_coefficient = 0.1
    new_value_shift_coefficient = 0.1  # alpha
    number_of_knn = 50
    n_step = 100
    bootstrap_total_return_from_old_policy = True
-   DND_key_error_threshold = 0.1
    DND_key_error_threshold = 0
    input_rescaler = 1.0
    num_consecutive_playing_steps = 4

class ActorCritic(AgentParameters):

View File

@@ -19,6 +19,7 @@ from logger import *
import gym
import numpy as np
import time
import random
try:
    import roboschool
    from OpenGL import GL
@@ -59,7 +60,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
# self.env_spec = gym.spec(self.env_id)
self.env.frameskip = self.frame_skip
self.discrete_controls = type(self.env.action_space) != gym.spaces.box.Box
self.random_initialization_steps = 0
self.state = self.reset(True)['state']

# render
@@ -113,6 +114,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
else:
    self.timestep_limit = None
self.measurements_size = len(self.step(0)['info'].keys())
self.random_initialization_steps = self.tp.env.random_initialization_steps

def _wrap_state(self, state):
    if isinstance(self.env.observation_space, gym.spaces.Dict):
@@ -155,8 +157,9 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
def _preprocess_state(self, state):
    # TODO: move this into wrapper
-   if any(env in self.env_id for env in ["Breakout", "Pong"]):
-       # crop image
    # crop image for atari games
    # the image from the environment is 210x160
    if self.tp.env.crop_observation and hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'):
        state['observation'] = state['observation'][34:195, :, :]
    return state
@@ -170,7 +173,16 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
self.env.seed(self.seed)
self.state = self._wrap_state(self.env.reset())
-while self.state is None:
-    self.step(0)
# initialize the number of lives
if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'):
    self.current_ale_lives = self.env.env.ale.lives()
# simulate a random initial environment state by stepping for a random number of times between 0 and 30
step_count = 0
random_initialization_steps = random.randint(0, self.random_initialization_steps)
while self.state is None or step_count < random_initialization_steps:
    step_count += 1
    self.step(0)
return self.state

View File

@@ -115,6 +115,14 @@ class ScreenLogger(object):
if default is not None:
    return default
def change_terminal_title(self, title: str):
"""
Changes the title of the terminal window
:param title: The new title
:return: None
"""
print("\x1b]2;{}\x07".format(title))
class BaseLogger(object):
    def __init__(self):
@@ -157,6 +165,7 @@ class Logger(BaseLogger):
self.time = None
self.experiments_path = ""
self.last_line_idx_written_to_csv = 0
self.experiment_name = ""

def set_current_time(self, time):
    self.time = time
@@ -205,7 +214,9 @@ class Logger(BaseLogger):
def signal_value_exists(self, time, signal_name):
    try:
-       self.get_signal_value(time, signal_name)
        value = self.get_signal_value(time, signal_name)
        if value != value:  # value is nan
            return False
    except:
        return False
    return True
@@ -229,7 +240,8 @@ class Logger(BaseLogger):
if self.start_time:
    self.create_signal_value('Wall-Clock Time', time.time() - self.start_time, time=episode)
else:
-   self.create_signal_value('Wall-Clock Time', time.time(), time=episode)
    self.create_signal_value('Wall-Clock Time', 0, time=episode)
    self.start_time = time.time()

def create_gif(self, images, fps=10, name="Gif"):
    output_file = '{}_{}.gif'.format(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'), name)
@@ -243,7 +255,7 @@ class Logger(BaseLogger):
def remove_experiment_dir(self):
    shutil.rmtree(self.experiments_path)

-def print_summary(self):
def summarize_experiment(self):
    screen.separator()
    screen.log_title("Results stored at: {}".format(self.experiments_path))
    screen.log_title("Total runtime: {}".format(datetime.datetime.now() - self.time_started))
@@ -273,7 +285,8 @@ class Logger(BaseLogger):
screen.error('Experiment name must be composed only of alphanumeric letters, '
             'underscores and dashes and should not be longer than 100 characters.')
-return match.group(0)
self.experiment_name = match.group(0)
return self.experiment_name

def get_experiment_path(self, experiment_name, create_path=True):
    general_experiments_path = os.path.join('./experiments/', experiment_name)

View File

@@ -83,6 +83,11 @@ class AnnoyDictionary(object):
# Returns the stored embeddings and values of the closest embeddings
def query(self, keys, k):
    if not self.has_enough_entries(k):
        # this will only happen when the DND is not yet populated with enough entries, which is only during heatup
        # these values won't be used and therefore they are meaningless
        return [0.0], [0.0], [0]

    _, indices = self._get_k_nearest_neighbors_indices(keys, k)
    embeddings = []
@@ -94,7 +99,7 @@ class AnnoyDictionary(object):
self.current_timestamp += 1

-return embeddings, values
return embeddings, values, indices

def has_enough_entries(self, k):
    return self.curr_size > k and (self.built_capacity > k)
@@ -133,9 +138,11 @@ class AnnoyDictionary(object):
class QDND:
-   def __init__(self, dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01):
    def __init__(self, dict_size, key_width, num_actions, new_value_shift_coefficient=0.1, key_error_threshold=0.01,
                 learning_rate=0.01):
        self.num_actions = num_actions
        self.dicts = []
        self.learning_rate = learning_rate

        # create a dict for each action
        for a in range(num_actions):
@@ -155,16 +162,18 @@ class QDND:
    self.dicts[a].add(curr_action_embeddings, curr_action_values)
return True

-def query(self, embeddings, actions, k):
-    # query for nearest neighbors to the given embeddings
-    dnd_embeddings = []
-    dnd_values = []
-    for i, action in enumerate(actions):
-        embedding, value = self.dicts[action].query([embeddings[i]], k)
-        dnd_embeddings.append(embedding[0])
-        dnd_values.append(value[0])
-    return dnd_embeddings, dnd_values
def query(self, embeddings, action, k):
    # query for nearest neighbors to the given embeddings
    dnd_embeddings = []
    dnd_values = []
    dnd_indices = []
    for i in range(len(embeddings)):
        embedding, value, indices = self.dicts[action].query([embeddings[i]], k)
        dnd_embeddings.append(embedding[0])
        dnd_values.append(value[0])
        dnd_indices.append(indices[0])
    return dnd_embeddings, dnd_values, dnd_indices

def has_enough_entries(self, k):
    # check if each of the action dictionaries has at least k entries
@@ -193,4 +202,5 @@ def load_dnd(model_dir):
        DND.dicts[a].index.add_item(idx, key)
    DND.dicts[a].index.build(50)
return DND

View File

@@ -16,6 +16,7 @@
from memories.memory import *
import threading
from typing import Union

class EpisodicExperienceReplay(Memory):
@@ -103,7 +104,8 @@ class EpisodicExperienceReplay(Memory):
if transition.game_over:
    self._num_transitions_in_complete_episodes += last_episode.length()
    self._length += 1
-   self.buffer[-1].update_returns(self.discount, is_bootstrapped=self.return_is_bootstrapped,
    self.buffer[-1].update_returns(self.discount,
                                   is_bootstrapped=self.tp.agent.bootstrap_total_return_from_old_policy,
                                   n_step_return=self.tp.agent.n_step)
    self.buffer[-1].update_measurements_targets(self.tp.agent.num_predicted_steps_ahead)
    # self.buffer[-1].update_actions_probabilities()  # used for off-policy policy optimization
@@ -146,6 +148,17 @@ class EpisodicExperienceReplay(Memory):
def get(self, index):
    return self.get_episode(index)
def get_last_complete_episode(self) -> Union[None, Episode]:
"""
Returns the last complete episode in the memory or None if there are no complete episodes
:return: None or the last complete episode
"""
last_complete_episode_index = self.num_complete_episodes()-1
if last_complete_episode_index >= 0:
return self.get(last_complete_episode_index)
else:
return None
def update_last_transition_info(self, info):
    episode = self.buffer[-1]
    if episode.length() == 0:

View File

@@ -80,9 +80,12 @@ class Episode(object):
    total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
    current_discount *= discount

# calculate the bootstrapped returns
bootstraps = np.array([np.squeeze(t.info['max_action_value']) for t in self.transitions[n_step_return:]])
bootstrapped_return = total_return + current_discount * np.pad(bootstraps, (0, n_step_return), 'constant',
                                                                constant_values=0)
if is_bootstrapped:
-   bootstraps = np.array([np.squeeze(t.info['action_value']) for t in self.transitions[n_step_return:]])
-   total_return += current_discount * np.pad(bootstraps, (0, n_step_return), 'constant', constant_values=0)
    total_return = bootstrapped_return

for transition_idx in range(self.length()):
    self.transitions[transition_idx].total_return = total_return[transition_idx]
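
The vectorized computation above is equivalent to the following per-transition loop, which may be easier to follow: each transition's total return is the sum of up to n_step_return discounted rewards, plus (when bootstrapping) the max online Q value recorded n_step_return transitions ahead, discounted by discount**n_step_return. A minimal sketch, assuming each transition exposes reward and info['max_action_value'] as in the code above (the FakeTransition stand-in is illustrative only):

```python
import numpy as np

def n_step_returns(transitions, discount, n_step_return, is_bootstrapped=True):
    rewards = [t.reward for t in transitions]
    bootstraps = [np.squeeze(t.info['max_action_value']) for t in transitions]
    returns = []
    for i in range(len(transitions)):
        # sum of up to n_step_return discounted rewards starting at transition i
        horizon = min(n_step_return, len(transitions) - i)
        ret = sum(discount ** j * rewards[i + j] for j in range(horizon))
        # bootstrap from the value estimate n_step_return transitions ahead, if it exists
        if is_bootstrapped and i + n_step_return < len(transitions):
            ret += discount ** n_step_return * bootstraps[i + n_step_return]
        returns.append(ret)
    return returns

class FakeTransition(object):  # minimal stand-in for a Transition, for illustration only
    def __init__(self, reward, max_q):
        self.reward = reward
        self.info = {'max_action_value': max_q}

episode = [FakeTransition(reward=1.0, max_q=0.5) for _ in range(10)]
print(n_step_returns(episode, discount=0.99, n_step_return=4))
```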
@@ -114,7 +117,13 @@ class Episode(object):
    return self.returns_table

def get_returns(self):
-   return [t.total_return for t in self.transitions]
    return self.get_transitions_attribute('total_return')

def get_transitions_attribute(self, attribute_name):
    if hasattr(self.transitions[0], attribute_name):
        return [t.__dict__[attribute_name] for t in self.transitions]
    else:
        raise ValueError("The transitions have no such attribute name")

def to_batch(self):
    batch = []
@@ -141,14 +150,12 @@ class Transition(object):
:param game_over: A boolean which should be True if the episode terminated after
                  the execution of the action.
"""
-self.state = copy.deepcopy(state)
self.state = state
self.state['observation'] = np.array(self.state['observation'], copy=False)
self.action = action
self.reward = reward
self.total_return = None
if not next_state:
    next_state = state
-self.next_state = copy.deepcopy(next_state)
self.next_state = next_state
self.next_state['observation'] = np.array(self.next_state['observation'], copy=False)
self.game_over = game_over
self.info = {}

plot_atari.py (new file, 105 lines)

@@ -0,0 +1,105 @@
import argparse
import matplotlib
import matplotlib.pyplot as plt
from dashboard import SignalsFile
import os
class FigureMaker(object):
def __init__(self, path, cols, smoothness, signal_to_plot, x_axis):
self.experiments_path = path
self.environments = self.list_environments()
self.cols = cols
self.rows = int((len(self.environments) + cols - 1) / cols)
self.smoothness = smoothness
self.signal_to_plot = signal_to_plot
self.x_axis = x_axis
params = {
'axes.labelsize': 8,
'font.size': 10,
'legend.fontsize': 14,
'xtick.labelsize': 8,
'ytick.labelsize': 8,
'text.usetex': False,
'figure.figsize': [16, 30]
}
matplotlib.rcParams.update(params)
def list_environments(self):
environments = sorted([e.name for e in os.scandir(args.path) if e.is_dir()])
filtered_environments = self.filter_environments(environments)
return filtered_environments
def filter_environments(self, environments):
filtered_environments = []
for idx, environment in enumerate(environments):
path = os.path.join(args.path, environment)
experiments = [e.name for e in os.scandir(path) if e.is_dir()]
# take only the last updated experiment directory
last_experiment_dir = max([os.path.join(path, root) for root in experiments], key=os.path.getctime)
# make sure there is a csv file inside it
for file_path in os.listdir(last_experiment_dir):
full_file_path = os.path.join(last_experiment_dir, file_path)
if os.path.isfile(full_file_path) and file_path.endswith('.csv'):
filtered_environments.append((environment, full_file_path))
return filtered_environments
def plot_figures(self):
for idx, (environment, full_file_path) in enumerate(self.environments):
print(environment)
axis = plt.subplot(self.rows, self.cols, idx + 1)
signals = SignalsFile(full_file_path)
signals.change_averaging_window(self.smoothness, force=True, signals=[self.signal_to_plot])
steps = signals.bokeh_source.data[self.x_axis]
rewards = signals.bokeh_source.data[self.signal_to_plot]
yloc = plt.MaxNLocator(4)
axis.yaxis.set_major_locator(yloc)
axis.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
plt.title(environment, fontsize=10, y=1.08)
plt.plot(steps, rewards, linewidth=0.8)
plt.subplots_adjust(hspace=2.0, wspace=0.4)
def save_pdf(self, name):
plt.savefig(name + ".pdf", bbox_inches='tight')
def show_figures(self):
plt.show()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--path',
help="(string) Root directory of the experiments",
default=None,
type=str)
parser.add_argument('-c', '--cols',
help="(int) Number of plot columns",
default=6,
type=int)
parser.add_argument('-s', '--smoothness',
help="(int) Number of consequent episodes to average over",
default=200,
type=int)
parser.add_argument('-sig', '--signal',
help="(str) The name of the signal to plot",
default='Evaluation Reward',
type=str)
parser.add_argument('-x', '--x_axis',
help="(str) The meaning of the x axis",
default='Total steps',
type=str)
parser.add_argument('-pdf', '--pdf',
help="(str) A name of a pdf to save to",
default='atari',
type=str)
args = parser.parse_args()
maker = FigureMaker(args.path, cols=args.cols, smoothness=args.smoothness, signal_to_plot=args.signal, x_axis=args.x_axis)
maker.plot_figures()
maker.save_pdf(args.pdf)
maker.show_figures()
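
Based on the argparse definitions above, a typical invocation would look something like the line below (the experiments directory layout is an assumption: one sub-directory per Atari environment, each holding timestamped runs that contain a CSV of logged signals):

```bash
python3 plot_atari.py -p ./experiments -c 6 -s 200 -sig 'Evaluation Reward' -x 'Total steps' -pdf atari
```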

View File

@@ -89,7 +89,6 @@ class Doom_Basic_QRDQN(Preset):
        self.num_heatup_steps = 1000

class Doom_Basic_OneStepQ(Preset):
    def __init__(self):
        Preset.__init__(self, NStepQ, Doom, ExplorationParameters)
@@ -408,8 +407,67 @@ class Breakout_DQN(Preset):
self.exploration.evaluation_policy = 'EGreedy'
self.exploration.evaluation_epsilon = 0.05
self.num_heatup_steps = 50000
self.agent.num_consecutive_playing_steps = 4
self.evaluation_episodes = 1
-self.evaluate_every_x_episodes = 100
self.evaluate_every_x_episodes = 25
self.agent.replace_mse_with_huber_loss = True
# self.env.crop_observation = True # TODO: remove
# self.rescaling_interpolation_type = 'nearest' # TODO: remove
class Breakout_DDQN(Preset):
def __init__(self):
Preset.__init__(self, DDQN, Atari, ExplorationParameters)
self.env.level = 'BreakoutDeterministic-v4'
self.agent.num_steps_between_copying_online_weights_to_target = 30000
self.learning_rate = 0.00025
self.agent.num_transitions_in_experience_replay = 1000000
self.exploration.initial_epsilon = 1.0
self.exploration.final_epsilon = 0.01
self.exploration.epsilon_decay_steps = 1000000
self.exploration.evaluation_policy = 'EGreedy'
self.exploration.evaluation_epsilon = 0.001
self.num_heatup_steps = 50000
self.agent.num_consecutive_playing_steps = 4
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 25
self.agent.replace_mse_with_huber_loss = True
class Breakout_Dueling_DDQN(Preset):
def __init__(self):
Preset.__init__(self, DDQN, Atari, ExplorationParameters)
self.env.level = 'BreakoutDeterministic-v4'
self.agent.output_types = [OutputTypes.DuelingQ]
self.agent.num_steps_between_copying_online_weights_to_target = 30000
self.learning_rate = 0.00025
self.agent.num_transitions_in_experience_replay = 1000000
self.exploration.initial_epsilon = 1.0
self.exploration.final_epsilon = 0.01
self.exploration.epsilon_decay_steps = 1000000
self.exploration.evaluation_policy = 'EGreedy'
self.exploration.evaluation_epsilon = 0.001
self.num_heatup_steps = 50000
self.agent.num_consecutive_playing_steps = 4
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 25
self.agent.replace_mse_with_huber_loss = True
class Alien_DQN(Preset):
def __init__(self):
Preset.__init__(self, DQN, Atari, ExplorationParameters)
self.env.level = 'AlienDeterministic-v4'
self.agent.num_steps_between_copying_online_weights_to_target = 10000
self.learning_rate = 0.00025
self.agent.num_transitions_in_experience_replay = 1000000
self.exploration.initial_epsilon = 1.0
self.exploration.final_epsilon = 0.1
self.exploration.epsilon_decay_steps = 1000000
self.exploration.evaluation_policy = 'EGreedy'
self.exploration.evaluation_epsilon = 0.05
self.num_heatup_steps = 50000
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 5
class Breakout_C51(Preset):
@@ -846,7 +904,8 @@ class CartPole_NEC(Preset):
self.num_heatup_steps = 1000
self.exploration.epsilon_decay_steps = 1000
self.exploration.final_epsilon = 0.1
-self.agent.discount = 1.0
self.agent.discount = 0.99
self.seed = 0
self.test = True
self.test_max_step_threshold = 200
@@ -857,10 +916,16 @@ class Doom_Basic_NEC(Preset):
def __init__(self):
    Preset.__init__(self, NEC, Doom, ExplorationParameters)
    self.env.level = 'basic'
-   self.agent.num_episodes_in_experience_replay = 200
-   self.learning_rate = 0.00025
-   self.num_heatup_steps = 1000
-   self.agent.num_playing_steps_between_two_training_steps = 1
    self.learning_rate = 0.00001
    self.agent.num_transitions_in_experience_replay = 100000
    # self.exploration.initial_epsilon = 0.1  # TODO: try exploration
    # self.exploration.final_epsilon = 0.1
    # self.exploration.epsilon_decay_steps = 1000000
    self.num_heatup_steps = 200
    self.evaluation_episodes = 1
    self.evaluate_every_x_episodes = 5
    self.seed = 123

class Montezuma_NEC(Preset):
@@ -877,12 +942,20 @@ class Breakout_NEC(Preset):
def __init__(self):
    Preset.__init__(self, NEC, Atari, ExplorationParameters)
    self.env.level = 'BreakoutDeterministic-v4'
-   self.learning_rate = 0.00025
-   self.exploration.initial_epsilon = 1.0
-   self.num_heatup_steps = 50000
    self.agent.num_steps_between_copying_online_weights_to_target = 10000
    self.learning_rate = 0.00001
    self.agent.num_transitions_in_experience_replay = 1000000
    self.exploration.initial_epsilon = 0.1
    self.exploration.final_epsilon = 0.1
    self.exploration.epsilon_decay_steps = 1000000
    self.exploration.evaluation_policy = 'EGreedy'
    self.exploration.evaluation_epsilon = 0.05
    self.num_heatup_steps = 1000
    self.env.reward_clipping_max = None
    self.env.reward_clipping_min = None
    self.evaluation_episodes = 1
    self.evaluate_every_x_episodes = 25
    self.seed = 123

class Doom_Health_NEC(Preset):
@@ -924,12 +997,54 @@ class Pong_NEC(Preset):
def __init__(self):
    Preset.__init__(self, NEC, Atari, ExplorationParameters)
    self.env.level = 'PongDeterministic-v4'
-   self.learning_rate = 0.001
-   self.exploration.initial_epsilon = 0.5
-   self.num_heatup_steps = 50000
    self.learning_rate = 0.00001
    self.agent.num_transitions_in_experience_replay = 100000
    self.exploration.initial_epsilon = 0.1  # TODO: try exploration
    self.exploration.final_epsilon = 0.1
    self.exploration.epsilon_decay_steps = 1000000
self.num_heatup_steps = 2000
self.env.reward_clipping_max = None
self.env.reward_clipping_min = None
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 5
self.env.crop_observation = True # TODO: remove
self.env.random_initialization_steps = 1 # TODO: remove
# self.seed = 123
class Alien_NEC(Preset):
def __init__(self):
Preset.__init__(self, NEC, Atari, ExplorationParameters)
self.env.level = 'AlienDeterministic-v4'
self.learning_rate = 0.0001
self.agent.num_transitions_in_experience_replay = 100000
self.exploration.initial_epsilon = 0.1 # TODO: try exploration
self.exploration.final_epsilon = 0.1
self.exploration.epsilon_decay_steps = 1000000
self.num_heatup_steps = 3000
self.env.reward_clipping_max = None
self.env.reward_clipping_min = None
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 5
self.seed = 123
class Pong_DQN(Preset):
def __init__(self):
Preset.__init__(self, DQN, Atari, ExplorationParameters)
self.env.level = 'PongDeterministic-v4'
self.agent.num_steps_between_copying_online_weights_to_target = 10000
self.learning_rate = 0.00025
self.agent.num_transitions_in_experience_replay = 1000000
self.exploration.initial_epsilon = 1.0
self.exploration.final_epsilon = 0.1
self.exploration.epsilon_decay_steps = 1000000
self.exploration.evaluation_policy = 'EGreedy'
self.exploration.evaluation_epsilon = 0.05
self.num_heatup_steps = 50000
self.evaluation_episodes = 1
self.evaluate_every_x_episodes = 5
self.seed = 123
class CartPole_A3C(Preset):

View File

@@ -50,6 +50,9 @@ if __name__ == '__main__':
parser.add_argument('-v', '--verbose',
                    help="(flag) display verbose logs in the event of an error",
                    action='store_true')
parser.add_argument('-l', '--list_presets',
help="(flag) list all the presets that are tested",
action='store_true')
parser.add_argument('--stop_after_first_failure',
                    help="(flag) stop executing tests after the first error",
                    action='store_true')
@@ -73,6 +76,14 @@ if __name__ == '__main__':
    presets_to_ignore = args.ignore_presets.split(',')
else:
    presets_to_ignore = []
if args.list_presets:
for idx, preset_name in enumerate(presets_lists):
preset = eval('presets.{}()'.format(preset_name))
if preset.test and preset_name not in presets_to_ignore:
print(preset_name)
exit(0)
for idx, preset_name in enumerate(presets_lists):
    preset = eval('presets.{}()'.format(preset_name))
    if preset.test and preset_name not in presets_to_ignore:

View File

@@ -21,6 +21,7 @@ import numpy as np
import threading
from subprocess import call, Popen
import signal
import copy

killed_processes = []
@@ -333,6 +334,23 @@ def switch_axes_order(observation, from_type='channels_first', to_type='channels
return np.transpose(observation, (1, 0))
class LazyStack(object):
"""
A lazy version of np.stack which avoids copying the memory until it is
needed.
"""
def __init__(self, history, axis=None):
self.history = copy.copy(history)
self.axis = axis
def __array__(self, dtype=None):
array = np.stack(self.history, axis=self.axis)
if dtype is not None:
array = array.astype(dtype)
return array
def stack_observation(curr_stack, observation, stack_size):
    """
    Adds a new observation to an existing stack of observations from previous time-steps.