mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

commit 85afb86893
parent 16c5032735
Author: Zach Dwiel
Date:   2018-02-16 09:35:58 -05:00

    temp commit

14 changed files with 244 additions and 127 deletions

View File

@@ -20,6 +20,17 @@ from utils import *
 import scipy.signal
+
+def last_sample(state):
+    """
+    given a batch of states, return the last sample of the batch with length 1
+    batch axis.
+    """
+    return {
+        k: np.expand_dims(v[-1], 0)
+        for k, v in state.items()
+    }
+
 # Actor Critic - https://arxiv.org/abs/1602.01783
 class ActorCriticAgent(PolicyOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
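
Note on the new last_sample helper above: it takes a batched state dictionary and keeps only the final transition of every component, with a length-1 batch axis, which is what the bootstrap predict calls below expect. A standalone sketch (array shapes are made up for illustration, not taken from the repo):

    import numpy as np

    def last_sample(state):
        # keep only the final element of each component, with a length-1 batch axis
        return {k: np.expand_dims(v[-1], 0) for k, v in state.items()}

    batch = {
        'observation': np.zeros((32, 84, 84, 4)),
        'measurements': np.zeros((32, 3)),
    }
    single = last_sample(batch)
    print(single['observation'].shape)   # (1, 84, 84, 4)
    print(single['measurements'].shape)  # (1, 3)
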
@@ -76,7 +87,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         if game_overs[-1]:
             R = 0
         else:
-            R = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
+            R = self.main_network.online_network.predict(last_sample(next_states))[0]
 
         for i in reversed(range(num_transitions)):
             R = rewards[i] + self.tp.agent.discount * R
@@ -85,7 +96,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
             # get bootstraps
-            bootstrapped_value = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
+            bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0]
             values = np.append(current_state_values, bootstrapped_value)
             if game_overs[-1]:
                 values[-1] = 0
@@ -101,7 +112,9 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         actions = np.expand_dims(actions, -1)
 
         # train
-        result = self.main_network.online_network.accumulate_gradients([current_states, actions],
+        inputs = copy.copy(current_states)
+        inputs['output_1_0'] = actions
+        result = self.main_network.online_network.accumulate_gradients(inputs,
                                                                        [state_value_head_targets, action_advantages])
 
         # logging
@@ -114,11 +127,17 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         return total_loss
 
     def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+        # TODO: rename curr_state -> state
         # convert to batch so we can run it through the network
-        observation = np.expand_dims(np.array(curr_state['observation']), 0)
+        curr_state = {
+            k: np.expand_dims(np.array(curr_state[k]), 0)
+            for k in curr_state.keys()
+        }
         if self.env.discrete_controls:
             # DISCRETE
-            state_value, action_probabilities = self.main_network.online_network.predict(observation)
+            state_value, action_probabilities = self.main_network.online_network.predict(curr_state)
             action_probabilities = action_probabilities.squeeze()
             if phase == RunPhase.TRAIN:
                 action = self.exploration_policy.get_action(action_probabilities)
@@ -128,7 +147,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
         else:
             # CONTINUOUS
-            state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
+            state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state)
             action_values_mean = action_values_mean.squeeze()
             action_values_std = action_values_std.squeeze()
             if phase == RunPhase.TRAIN:

View File

@@ -93,7 +93,7 @@ class Agent(object):
         self.running_reward = None
         self.training_iteration = 0
         self.current_episode = self.tp.current_episode = 0
-        self.curr_state = []
+        self.curr_state = {}
         self.current_episode_steps_counter = 0
         self.episode_running_info = {}
         self.last_episode_evaluation_ran = 0
@@ -194,7 +194,7 @@ class Agent(object):
         for signal in self.signals:
             signal.reset()
         self.total_reward_in_current_episode = 0
-        self.curr_state = []
+        self.curr_state = {}
         self.last_episode_images = []
         self.current_episode_steps_counter = 0
         self.episode_running_info = {}
@@ -289,23 +289,20 @@ class Agent(object):
         :param batch: An array of transitions
         :return: For each transition element, returns a numpy array of all the transitions in the batch
         """
-        current_observations = np.array([transition.state['observation'] for transition in batch])
-        next_observations = np.array([transition.next_state['observation'] for transition in batch])
+        current_states = {}
+        next_states = {}
+        current_states['observation'] = np.array([transition.state['observation'] for transition in batch])
+        next_states['observation'] = np.array([transition.next_state['observation'] for transition in batch])
         actions = np.array([transition.action for transition in batch])
         rewards = np.array([transition.reward for transition in batch])
         game_overs = np.array([transition.game_over for transition in batch])
         total_return = np.array([transition.total_return for transition in batch])
-        current_states = current_observations
-        next_states = next_observations
 
         # get the entire state including measurements if available
         if self.tp.agent.use_measurements:
-            current_measurements = np.array([transition.state['measurements'] for transition in batch])
-            next_measurements = np.array([transition.next_state['measurements'] for transition in batch])
-            current_states = [current_observations, current_measurements]
-            next_states = [next_observations, next_measurements]
+            current_states['measurements'] = np.array([transition.state['measurements'] for transition in batch])
+            next_states['measurements'] = np.array([transition.next_state['measurements'] for transition in batch])
 
         return current_states, next_states, actions, rewards, game_overs, total_return
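
Note: extract_batch now returns current_states and next_states as dictionaries keyed by state component instead of a bare array or an ad-hoc list. A standalone sketch of the resulting structure (the Transition stand-in and the shapes are illustrative only, not the repo's replay buffer):

    import numpy as np
    from collections import namedtuple

    # hypothetical stand-in for the replay transition; field names follow the hunk above
    Transition = namedtuple('Transition', 'state next_state action reward game_over total_return')

    batch = [Transition(state={'observation': np.zeros((84, 84, 4))},
                        next_state={'observation': np.zeros((84, 84, 4))},
                        action=1, reward=0.0, game_over=False, total_return=0.0)
             for _ in range(8)]

    current_states = {'observation': np.array([t.state['observation'] for t in batch])}
    print(current_states['observation'].shape)  # (8, 84, 84, 4)
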
@@ -353,12 +350,24 @@ class Agent(object):
         # get new action
         action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0}
-        is_first_transition_in_episode = (self.curr_state == [])
+        is_first_transition_in_episode = (self.curr_state == {})
         if is_first_transition_in_episode:
-            observation = self.preprocess_observation(self.env.observation)
-            observation = stack_observation([], observation, self.tp.env.observation_stack_size)
-            self.curr_state = {'observation': observation}
+            if not isinstance(self.env.state, dict):
+                raise ValueError((
+                    'expected state to be a dictionary, found {}'
+                ).format(type(self.env.state)))
+            state = self.env.state
+
+            # TODO: modify preprocess_observation to modify the entire state
+            # for now, only preprocess the observation
+            state['observation'] = self.preprocess_observation(state['observation'])
+
+            # TODO: provide option to stack more than just the observation
+            # TODO: this should probably be happening in an environment wrapper anyway
+            state['observation'] = stack_observation([], state['observation'], self.tp.env.observation_stack_size)
+
+            self.curr_state = state
+
+            # TODO: this should be handled in the environment
             if self.tp.agent.use_measurements:
                 self.curr_state['measurements'] = self.env.measurements
                 if self.tp.agent.use_accumulated_reward_as_measurement:
@@ -373,22 +382,25 @@ class Agent(object):
         if type(action) == np.ndarray:
             action = action.squeeze()
         result = self.env.step(action)
         shaped_reward = self.preprocess_reward(result['reward'])
         if 'action_intrinsic_reward' in action_info.keys():
             shaped_reward += action_info['action_intrinsic_reward']
+        # TODO: should total_reward_in_current_episode include shaped_reward?
         self.total_reward_in_current_episode += result['reward']
-        observation = self.preprocess_observation(result['observation'])
+        next_state = result['state']
+        next_state['observation'] = self.preprocess_observation(next_state['observation'])
 
         # plot action values online
         if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
             self.plot_action_values_online()
 
         # initialize the next state
-        observation = stack_observation(self.curr_state['observation'], observation, self.tp.env.observation_stack_size)
-        next_state = {'observation': observation}
+        # TODO: provide option to stack more than just the observation
+        next_state['observation'] = stack_observation(self.curr_state['observation'], next_state['observation'], self.tp.env.observation_stack_size)
 
         if self.tp.agent.use_measurements and 'measurements' in result.keys():
-            next_state['measurements'] = result['measurements']
+            next_state['measurements'] = result['state']['measurements']
             if self.tp.agent.use_accumulated_reward_as_measurement:
                 next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)

View File

@@ -48,7 +48,7 @@ class TensorFlowArchitecture(Architecture):
         self.network_is_local = network_is_local
         assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent'
         self.sess = tuning_parameters.sess
-        self.inputs = []
+        self.inputs = {}
         self.outputs = []
         self.targets = []
         self.losses = []
@@ -106,7 +106,8 @@ class TensorFlowArchitecture(Architecture):
         # gradients of the outputs w.r.t. the inputs
         # at the moment, this is only used by ddpg
         if len(self.outputs) == 1:
-            self.gradients_wrt_inputs = [tf.gradients(self.outputs[0], input_ph) for input_ph in self.inputs]
+            # TODO: convert gradients_with_respect_to_inputs into dictionary?
+            self.gradients_wrt_inputs = [tf.gradients(self.outputs[0], input_ph) for input_ph in self.inputs.values()]
             self.gradients_weights_ph = tf.placeholder('float32', self.outputs[0].shape, 'output_gradient_weights')
             self.weighted_gradients = tf.gradients(self.outputs[0], self.trainable_weights, self.gradients_weights_ph)
@@ -169,9 +170,8 @@ class TensorFlowArchitecture(Architecture):
         # feed inputs
         if additional_fetches is None:
             additional_fetches = []
-        inputs = force_list(inputs)
-        feed_dict = dict(zip(self.inputs, inputs))
+        feed_dict = self._feed_dict(inputs)
 
         # feed targets
         targets = force_list(targets)
@@ -266,6 +266,12 @@ class TensorFlowArchitecture(Architecture):
             while self.tp.sess.run(self.release_counter) % self.tp.num_threads != 0:
                 time.sleep(0.00001)
 
+    def _feed_dict(self, inputs):
+        return {
+            self.inputs[input_name]: input_value
+            for input_name, input_value in inputs.items()
+        }
+
     def predict(self, inputs, outputs=None):
         """
         Run a forward pass of the network using the given input
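
Note: _feed_dict replaces the positional dict(zip(self.inputs, force_list(inputs))) pairing, so input values are matched to placeholders by name rather than by list order. A toy illustration that runs without TensorFlow (strings stand in for the placeholders; names are hypothetical):

    class Net(object):
        def __init__(self):
            # name -> placeholder; strings stand in for tf placeholders here
            self.inputs = {'observation': 'obs_ph', 'measurements': 'meas_ph'}

        def _feed_dict(self, inputs):
            # pair each named input value with its placeholder, independent of ordering
            return {self.inputs[name]: value for name, value in inputs.items()}

    net = Net()
    print(net._feed_dict({'measurements': [1.0], 'observation': [[0.0]]}))
    # {'meas_ph': [1.0], 'obs_ph': [[0.0]]}
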
@@ -275,8 +281,8 @@ class TensorFlowArchitecture(Architecture):
         WARNING: must only call once per state since each call is assumed by LSTM to be a new time step.
         """
-        feed_dict = dict(zip(self.inputs, force_list(inputs)))
+        # TODO: rename self.inputs -> self.input_placeholders
+        feed_dict = self._feed_dict(inputs)
 
         if outputs is None:
             outputs = self.outputs
@@ -290,21 +296,21 @@ class TensorFlowArchitecture(Architecture):
         return squeeze_list(output)
 
-    def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None):
-        """
-        Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
-        :param additional_fetches: Optional tensors to fetch during the training process
-        :param inputs: The input for the network
-        :param targets: The targets corresponding to the input batch
-        :param scaler: A scaling factor that allows rescaling the gradients before applying them
-        :return: The loss of the network
-        """
-        if additional_fetches is None:
-            additional_fetches = []
-        force_list(additional_fetches)
-        loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches)
-        self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
-        return loss
+    # def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None):
+    #     """
+    #     Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
+    #     :param additional_fetches: Optional tensors to fetch during the training process
+    #     :param inputs: The input for the network
+    #     :param targets: The targets corresponding to the input batch
+    #     :param scaler: A scaling factor that allows rescaling the gradients before applying them
+    #     :return: The loss of the network
+    #     """
+    #     if additional_fetches is None:
+    #         additional_fetches = []
+    #     force_list(additional_fetches)
+    #     loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches)
+    #     self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
+    #     return loss
 
     def get_weights(self):
         """

View File

@@ -112,7 +112,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
             ####################
 
             state_embedding = []
-            for idx, input_type in enumerate(self.tp.agent.input_types):
+            for input_name, input_type in self.tp.agent.input_types.items():
                 # get the class of the input embedder
                 input_embedder = self.get_input_embedder(input_type)
                 self.input_embedders.append(input_embedder)
@@ -122,9 +122,9 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
                 # the existing input_placeholders into the input_embedders.
                 if network_idx == 0:
                     input_placeholder, embedding = input_embedder()
-                    self.inputs.append(input_placeholder)
+                    self.inputs[input_name] = input_placeholder
                 else:
-                    input_placeholder, embedding = input_embedder(self.inputs[idx])
+                    input_placeholder, embedding = input_embedder(self.inputs[input_name])
                 state_embedding.append(embedding)
@@ -159,13 +159,15 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
             # build the head
             if self.network_is_local:
-                output, target_placeholder, input_placeholder = self.output_heads[-1](head_input)
+                output, target_placeholder, input_placeholders = self.output_heads[-1](head_input)
                 self.targets.extend(target_placeholder)
             else:
-                output, input_placeholder = self.output_heads[-1](head_input)
+                output, input_placeholders = self.output_heads[-1](head_input)
 
             self.outputs.extend(output)
-            self.inputs.extend(input_placeholder)
+            # TODO: use head names as well
+            for placeholder_index, input_placeholder in enumerate(input_placeholders):
+                self.inputs['output_{}_{}'.format(head_idx, placeholder_index)] = input_placeholder
 
             # Losses
             self.losses = tf.losses.get_losses(self.name)
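
Note: head input placeholders are now registered in self.inputs under keys of the form 'output_{head_idx}_{placeholder_index}', which is the key the actor-critic training code above fills (inputs['output_1_0'] = actions). A toy sketch of the naming scheme; the head layout here is an assumption for illustration, not the repo's exact configuration:

    inputs = {}
    # e.g. head 0 (V) exposes no extra placeholders, head 1 (Pi) exposes one for the actions
    head_placeholders = [[], ['actions_ph']]
    for head_idx, placeholders in enumerate(head_placeholders):
        for placeholder_index, placeholder in enumerate(placeholders):
            inputs['output_{}_{}'.format(head_idx, placeholder_index)] = placeholder
    print(inputs)  # {'output_1_0': 'actions_ph'}
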

View File

@@ -250,7 +250,7 @@ class MeasurementsPredictionHead(Head):
                                         name='output')
             action_stream = tf.reshape(action_stream,
                                        (tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
-            action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keep_dims=True)
+            action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
 
             # merge to future measurements predictions
             self.output = tf.add(expectation_stream, action_stream, name='output')
@@ -302,7 +302,7 @@ class DNDQHead(Head):
         square_diff = tf.square(dnd_embeddings - tf.expand_dims(input_layer, 1))
         distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
         weights = 1.0 / distances
-        normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
+        normalised_weights = weights / tf.reduce_sum(weights, axis=1, keepdims=True)
         return tf.reduce_sum(dnd_values * normalised_weights, axis=1)

View File

@@ -30,6 +30,14 @@ from subprocess import Popen
 import datetime
 import presets
 import atexit
+import sys
+import subprocess
+from threading import Thread
+
+try:
+    from Queue import Queue, Empty
+except ImportError:
+    from queue import Queue, Empty  # for Python 3.x
 
 if len(set(failed_imports)) > 0:
     screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
@@ -152,6 +160,38 @@ def run_dict_to_json(_run_dict, task_id=''):
     return json_path
 
 
+def enqueue_output(out, queue):
+    for line in iter(out.readline, b''):
+        queue.put(line)
+    out.close()
+
+
+def merge_streams(processes, output_stream=sys.stdout):
+    q = Queue()
+
+    threads = []
+    for p in processes:
+        threads.append(Thread(target=enqueue_output, args=(p.stdout, q)))
+        threads.append(Thread(target=enqueue_output, args=(p.stderr, q)))
+
+    for t in threads:
+        t.daemon = True
+        t.start()
+
+    while True:
+        try:
+            line = q.get_nowait()
+        except Empty:
+            # break when all processes are done and q is empty
+            if all(p.poll() is not None for p in processes):
+                break
+        else:
+            # sys.stdout.write(line)
+            output_stream.write(line.decode(output_stream.encoding))
+            output_stream.flush()
+
+    print('All processes done')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('-p', '--preset',
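
Note on merge_streams above: readline() on a pipe blocks, so each worker's stdout and stderr gets its own daemon reader thread that pushes lines onto a shared Queue; the main loop drains the queue with get_nowait() and exits once every process has terminated and the queue is empty. A hypothetical usage sketch, assuming merge_streams from the hunk above is in scope (the child commands are placeholders, not the repo's worker invocation):

    import subprocess
    import sys

    procs = [subprocess.Popen([sys.executable, '-c', 'print("hello from child {}")'.format(i)],
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
             for i in range(2)]
    merge_streams(procs, output_stream=sys.stdout)  # returns once both children have exited
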
@@ -252,6 +292,8 @@ if __name__ == "__main__":
     if not args.no_summary:
         atexit.register(logger.print_summary)
 
+    set_cpu()
+
     # Single-threaded runs
     if run_dict['num_threads'] == 1:
         # set tuning parameters
@@ -285,11 +327,13 @@ if __name__ == "__main__":
         set_cpu()
 
         # create a parameter server
-        Popen(["python3",
+        parameter_server = Popen([
+            "python3",
                "./parallel_actor.py",
                "--ps_hosts={}".format(ps_hosts),
                "--worker_hosts={}".format(worker_hosts),
-               "--job_name=ps"])
+            "--job_name=ps",
+        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
 
         screen.log_title("*** Distributed Training ***")
         time.sleep(1)
@@ -314,13 +358,15 @@ if __name__ == "__main__":
                             "--job_name=worker",
                             "--load_json={}".format(json_run_dict_path)]
 
-            p = Popen(workers_args)
+            p = Popen(workers_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
 
             if i != run_dict['num_threads']:
                 workers.append(p)
             else:
                 evaluation_worker = p
 
+        merge_streams(workers + [parameter_server])
+
         # wait for all workers
         [w.wait() for w in workers]
         evaluation_worker.kill()

View File

@@ -77,7 +77,7 @@ class AgentParameters(Parameters):
     agent = ''
 
     # Architecture parameters
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     middleware_type = MiddlewareTypes.FC
     loss_weights = [1.0]
@@ -327,7 +327,7 @@ class Human(AgentParameters):
 class NStepQ(AgentParameters):
     type = 'NStepQAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     optimizer_type = 'Adam'
@@ -343,7 +343,7 @@ class NStepQ(AgentParameters):
 class DQN(AgentParameters):
     type = 'DQNAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     optimizer_type = 'Adam'
@@ -385,7 +385,7 @@ class QuantileRegressionDQN(DQN):
 class NEC(AgentParameters):
     type = 'NECAgent'
     optimizer_type = 'RMSProp'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.DNDQ]
     loss_weights = [1.0]
     dnd_size = 500000
@@ -399,7 +399,7 @@ class NEC(AgentParameters):
 class ActorCritic(AgentParameters):
     type = 'ActorCriticAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V, OutputTypes.Pi]
     loss_weights = [0.5, 1.0]
     stop_gradients_from_head = [False, False]
@@ -417,7 +417,7 @@ class ActorCritic(AgentParameters):
 class PolicyGradient(AgentParameters):
     type = 'PolicyGradientsAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Pi]
     loss_weights = [1.0]
     num_episodes_in_experience_replay = 2
@@ -430,7 +430,7 @@ class PolicyGradient(AgentParameters):
 class DDPG(AgentParameters):
     type = 'DDPGAgent'
-    input_types = [InputTypes.Observation, InputTypes.Action]
+    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
     output_types = [OutputTypes.V]  # V is used because we only want a single Q value
     loss_weights = [1.0]
     hidden_layers_activation_function = 'relu'
@@ -443,7 +443,7 @@ class DDPG(AgentParameters):
 class DDDPG(AgentParameters):
     type = 'DDPGAgent'
-    input_types = [InputTypes.Observation, InputTypes.Action]
+    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
     output_types = [OutputTypes.V]  # V is used because we only want a single Q value
     loss_weights = [1.0]
     hidden_layers_activation_function = 'relu'
@@ -456,7 +456,7 @@ class DDDPG(AgentParameters):
 class NAF(AgentParameters):
     type = 'NAFAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.NAF]
     loss_weights = [1.0]
     hidden_layers_activation_function = 'tanh'
@@ -469,7 +469,7 @@ class NAF(AgentParameters):
 class PPO(AgentParameters):
     type = 'PPOAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V]
     loss_weights = [1.0]
     hidden_layers_activation_function = 'tanh'
@@ -489,7 +489,7 @@ class PPO(AgentParameters):
 class ClippedPPO(AgentParameters):
     type = 'ClippedPPOAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V, OutputTypes.PPO]
     loss_weights = [0.5, 1.0]
     stop_gradients_from_head = [False, False]
@@ -515,7 +515,11 @@ class ClippedPPO(AgentParameters):
 class DFP(AgentParameters):
     type = 'DFPAgent'
-    input_types = [InputTypes.Observation, InputTypes.Measurements, InputTypes.GoalVector]
+    input_types = {
+        'observation': InputTypes.Observation,
+        'measurements': InputTypes.Measurements,
+        'goal': InputTypes.GoalVector
+    }
     output_types = [OutputTypes.MeasurementsPrediction]
     loss_weights = [1.0]
     use_measurements = True
@@ -527,7 +531,7 @@ class DFP(AgentParameters):
 class MMC(AgentParameters):
     type = 'MixedMonteCarloAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     num_steps_between_copying_online_weights_to_target = 1000
@@ -537,7 +541,7 @@ class MMC(AgentParameters):
 class PAL(AgentParameters):
     type = 'PALAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     pal_alpha = 0.9
@@ -548,7 +552,7 @@ class PAL(AgentParameters):
 class BC(AgentParameters):
     type = 'BCAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     collect_new_data = False

View File

@@ -161,7 +161,6 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
         measurements = []
         while type(measurements) == list:
             measurements, sensor_data = self.game.read_data()
-        self.observation = sensor_data['CameraRGB'].data
 
         self.location = (measurements.player_measurements.transform.location.x,
                          measurements.player_measurements.transform.location.y,
@@ -181,7 +180,10 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
                       - np.abs(self.control.steer) * 10
 
         # update measurements
-        self.measurements = [measurements.player_measurements.forward_speed]
+        self.observation = {
+            'observation': sensor_data['CameraRGB'].data,
+            'measurements': [measurements.player_measurements.forward_speed],
+        }
         self.autopilot = measurements.player_measurements.autopilot_control
 
         # action_p = ['%.2f' % member for member in [self.control.throttle, self.control.steer]]

View File

@@ -135,8 +135,10 @@ class DoomEnvironmentWrapper(EnvironmentWrapper):
         # extract all data from the current state
         state = self.game.get_state()
         if state is not None and state.screen_buffer is not None:
-            self.observation = state.screen_buffer
-            self.measurements = state.game_variables
+            self.observation = {
+                'observation': state.screen_buffer,
+                'measurements': state.game_variables,
+            }
         self.reward = self.game.get_last_reward()
         self.done = self.game.is_episode_finished()
@@ -157,5 +159,3 @@ class DoomEnvironmentWrapper(EnvironmentWrapper):
     def _restart_environment_episode(self, force_environment_reset=False):
         self.game.new_episode()

View File

@@ -31,14 +31,13 @@ class EnvironmentWrapper(object):
         # env initialization
         self.game = []
         self.actions = {}
-        self.observation = []
+        self.state = []
         self.reward = 0
         self.done = False
         self.default_action = 0
         self.last_action_idx = 0
         self.episode_idx = 0
         self.last_episode_time = time.time()
-        self.measurements = []
         self.info = []
         self.action_space_low = 0
         self.action_space_high = 0
@@ -65,6 +64,22 @@ class EnvironmentWrapper(object):
         self.game_is_open = True
         self.renderer = Renderer()
 
+    @property
+    def measurements(self):
+        assert False
+
+    @measurements.setter
+    def measurements(self, value):
+        assert False
+
+    @property
+    def observation(self):
+        assert False
+
+    @observation.setter
+    def observation(self, value):
+        assert False
+
     def _idx_to_action(self, action_idx):
         """
         Convert an action index to one of the environment available actions.
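
Note: the wrapper's separate observation/measurements attributes are folded into a single state dictionary, and the assert-False properties above act as tripwires so any leftover code that still touches self.observation or self.measurements fails immediately. For reference, a sketch of the dictionary step() and reset() now return (the values here are placeholders, not real environment output):

    import numpy as np

    result = {
        'state': {
            'observation': np.zeros((84, 84, 3)),  # frame or feature vector
            'measurements': np.zeros(3),           # only for environments that expose measurements
        },
        'reward': 0.0,
        'done': False,
        'action': 0,
        'info': {},
    }
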
@@ -116,7 +131,7 @@ class EnvironmentWrapper(object):
         """
         Perform a single step on the environment using the given action
         :param action_idx: the action to perform on the environment
-        :return: A dictionary containing the observation, reward, done flag, action and measurements
+        :return: A dictionary containing the state, reward, done flag and action
         """
         self.last_action_idx = action_idx
@@ -127,13 +142,12 @@ class EnvironmentWrapper(object):
         if self.is_rendered:
             self.render()
 
-        self.observation = self._preprocess_observation(self.observation)
+        self.state = self._preprocess_state(self.state)
 
-        return {'observation': self.observation,
+        return {'state': self.state,
                 'reward': self.reward,
                 'done': self.done,
                 'action': self.last_action_idx,
-                'measurements': self.measurements,
                 'info': self.info}
 
     def render(self):
@@ -146,7 +160,7 @@ class EnvironmentWrapper(object):
         """
         Reset the environment and all the variable of the wrapper
         :param force_environment_reset: forces environment reset even when the game did not end
-        :return: A dictionary containing the observation, reward, done flag, action and measurements
+        :return: A dictionary containing the state, reward, done flag and action
        """
         self._restart_environment_episode(force_environment_reset)
         self.last_episode_time = time.time()
@@ -156,17 +170,18 @@ class EnvironmentWrapper(object):
         self.last_action_idx = 0
         self._update_state()
 
-        # render before the preprocessing of the observation, so that the image will be in its original quality
+        # render before the preprocessing of the state, so that the image will be in its original quality
         if self.is_rendered:
             self.render()
 
-        self.observation = self._preprocess_observation(self.observation)
+        # TODO BUG: if the environment has not been reset, _preprocess_state will be running on an already preprocessed state
+        # TODO: see also _update_state above
+        self.state = self._preprocess_state(self.state)
 
-        return {'observation': self.observation,
+        return {'state': self.state,
                 'reward': self.reward,
                 'done': self.done,
                 'action': self.last_action_idx,
-                'measurements': self.measurements,
                 'info': self.info}
 
     def get_random_action(self):
@@ -216,19 +231,19 @@ class EnvironmentWrapper(object):
         """
         pass
 
-    def _preprocess_observation(self, observation):
+    def _preprocess_state(self, state):
         """
-        Do initial observation preprocessing such as cropping, rgb2gray, rescale etc.
+        Do initial state preprocessing such as cropping, rgb2gray, rescale etc.
         Implementing this function is optional.
-        :param observation: a raw observation from the environment
-        :return: the preprocessed observation
+        :param state: a raw state from the environment
+        :return: the preprocessed state
         """
-        return observation
+        return state
 
     def _update_state(self):
         """
         Updates the state from the environment.
-        Should update self.observation, self.reward, self.done, self.measurements and self.info
+        Should update self.state, self.reward, self.done and self.info
         :return: None
         """
         pass
@@ -243,7 +258,8 @@ class EnvironmentWrapper(object):
     def get_rendered_image(self):
         """
         Return a numpy array containing the image that will be rendered to the screen.
-        This can be different from the observation. For example, mujoco's observation is a measurements vector.
+        This can be different from the state. For example, mujoco's state is a measurements vector.
         :return: numpy array containing the image that will be rendered to the screen
         """
-        return self.observation
+        # TODO: probably needs revisiting
+        return self.state

View File

@@ -60,7 +60,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
         self.env.frameskip = self.frame_skip
         self.discrete_controls = type(self.env.action_space) != gym.spaces.box.Box
 
-        self.observation = self.reset(True)['observation']
+        self.state = self.reset(True)['state']
 
         # render
         if self.is_rendered:
@@ -70,12 +70,13 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             scale = 2
             self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
 
-        self.is_state_type_image = len(self.observation.shape) > 1
+        # TODO: collect and store this as observation space instead
+        self.is_state_type_image = len(self.state['observation'].shape) > 1
         if self.is_state_type_image:
-            self.width = self.observation.shape[1]
-            self.height = self.observation.shape[0]
+            self.width = self.state['observation'].shape[1]
+            self.height = self.state['observation'].shape[0]
         else:
-            self.width = self.observation.shape[0]
+            self.width = self.state['observation'].shape[0]
 
         # action space
         self.actions_description = {}
@@ -101,6 +102,12 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             self.timestep_limit = None
         self.measurements_size = len(self.step(0)['info'].keys())
 
+    def _wrap_state(self, state):
+        if isinstance(self.env.observation_space, gym.spaces.Dict):
+            return state
+        else:
+            return {'observation': state}
+
     def _update_state(self):
         if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'):
             if self.phase == RunPhase.TRAIN and hasattr(self, 'current_ale_lives'):
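
Note: _wrap_state normalizes gym observations into the state-dictionary form used elsewhere in this commit: Dict observation spaces pass through unchanged, anything else (Box, Discrete, ...) is wrapped under the 'observation' key. A standalone toy version (gym replaced by a boolean flag so the sketch runs on its own):

    def wrap_state(state, observation_space_is_dict):
        # Dict spaces already look like a state dictionary; wrap everything else
        if observation_space_is_dict:
            return state
        return {'observation': state}

    print(wrap_state([0.1, 0.2], observation_space_is_dict=False))  # {'observation': [0.1, 0.2]}
    print(wrap_state({'observation': [0.1]}, observation_space_is_dict=True))
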
@@ -131,28 +138,30 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             action = np.squeeze(action)
             action = np.clip(action, self.action_space_low, self.action_space_high)
 
-        self.observation, self.reward, self.done, self.info = self.env.step(action)
+        state, self.reward, self.done, self.info = self.env.step(action)
+        self.state = self._wrap_state(state)
 
-    def _preprocess_observation(self, observation):
+    def _preprocess_state(self, state):
+        # TODO: move this into wrapper
         if any(env in self.env_id for env in ["Breakout", "Pong"]):
             # crop image
-            observation = observation[34:195, :, :]
-        return observation
+            state['observation'] = state['observation'][34:195, :, :]
+        return state
 
     def _restart_environment_episode(self, force_environment_reset=False):
         # prevent reset of environment if there are ale lives left
         if (hasattr(self.env, 'env') and hasattr(self.env.env, 'ale') and self.env.env.ale.lives() > 0) \
                 and not force_environment_reset and not self.env._past_limit():
-            return self.observation
+            return self.state
 
         if self.seed:
             self.env.seed(self.seed)
-        self.observation = self.env.reset()
+        self.state = self._wrap_state(self.env.reset())
 
-        while self.observation is None:
+        while self.state is None:
             self.step(0)
 
-        return self.observation
+        return self.state
 
     def get_rendered_image(self):
         return self.env.render(mode='rgb_array')

View File

@@ -250,13 +250,13 @@ class Logger(BaseLogger):
         if 'Training Reward' in self.data.keys() and 'Evaluation Reward' in self.data.keys():
             screen.log_title("Max training reward: {}, max evaluation reward: {}".format(self.data['Training Reward'].max(), self.data['Evaluation Reward'].max()))
         screen.separator()
-        if screen.ask_yes_no("Do you want to discard the experiment results (Warning: this cannot be undone)?", False):
-            self.remove_experiment_dir()
-        elif screen.ask_yes_no("Do you want to specify a different experiment name to save to?", False):
-            new_name = self.get_experiment_name()
-            new_path = self.get_experiment_path(new_name, create_path=False)
-            shutil.move(self.experiments_path, new_path)
-            screen.log_title("Results moved to: {}".format(new_path))
+        # if screen.ask_yes_no("Do you want to discard the experiment results (Warning: this cannot be undone)?", False):
+        #     self.remove_experiment_dir()
+        # elif screen.ask_yes_no("Do you want to specify a different experiment name to save to?", False):
+        #     new_name = self.get_experiment_name()
+        #     new_path = self.get_experiment_path(new_name, create_path=False)
+        #     shutil.move(self.experiments_path, new_path)
+        #     screen.log_title("Results moved to: {}".format(new_path))
 
     def get_experiment_name(self, initial_experiment_name=''):
         match = None

View File

@@ -109,6 +109,7 @@ if __name__ == '__main__':
             num_workers=preset.test_num_workers,
             log_file_name=log_file_name,
         )
+        print('cmd', cmd)
         p = subprocess.Popen(cmd, shell=True, executable="/bin/bash", preexec_fn=os.setsid)
 
         # get the csv with the results