mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00

temp commit

This commit is contained in:
Zach Dwiel
2018-02-16 09:35:58 -05:00
parent 16c5032735
commit 85afb86893
14 changed files with 244 additions and 127 deletions

View File

@@ -20,6 +20,17 @@ from utils import *
 import scipy.signal
 
+
+def last_sample(state):
+    """
+    given a batch of states, return the last sample of the batch with length 1
+    batch axis.
+    """
+    return {
+        k: np.expand_dims(v[-1], 0)
+        for k, v in state.items()
+    }
+
 
 # Actor Critic - https://arxiv.org/abs/1602.01783
 class ActorCriticAgent(PolicyOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
@@ -76,7 +87,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             if game_overs[-1]:
                 R = 0
             else:
-                R = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
+                R = self.main_network.online_network.predict(last_sample(next_states))[0]
 
             for i in reversed(range(num_transitions)):
                 R = rewards[i] + self.tp.agent.discount * R
@@ -85,7 +96,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
             # get bootstraps
-            bootstrapped_value = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
+            bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0]
             values = np.append(current_state_values, bootstrapped_value)
             if game_overs[-1]:
                 values[-1] = 0
@@ -101,7 +112,9 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             actions = np.expand_dims(actions, -1)
 
         # train
-        result = self.main_network.online_network.accumulate_gradients([current_states, actions],
+        inputs = copy.copy(current_states)
+        inputs['output_1_0'] = actions
+        result = self.main_network.online_network.accumulate_gradients(inputs,
                                                                        [state_value_head_targets, action_advantages])
 
         # logging
@@ -114,11 +127,17 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         return total_loss
 
     def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+        # TODO: rename curr_state -> state
         # convert to batch so we can run it through the network
-        observation = np.expand_dims(np.array(curr_state['observation']), 0)
+        curr_state = {
+            k: np.expand_dims(np.array(curr_state[k]), 0)
+            for k in curr_state.keys()
+        }
         if self.env.discrete_controls:
             # DISCRETE
-            state_value, action_probabilities = self.main_network.online_network.predict(observation)
+            state_value, action_probabilities = self.main_network.online_network.predict(curr_state)
             action_probabilities = action_probabilities.squeeze()
             if phase == RunPhase.TRAIN:
                 action = self.exploration_policy.get_action(action_probabilities)
@@ -128,7 +147,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
                 self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
         else:
             # CONTINUOUS
-            state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
+            state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state)
             action_values_mean = action_values_mean.squeeze()
             action_values_std = action_values_std.squeeze()
             if phase == RunPhase.TRAIN:
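
A quick illustration of what the new last_sample helper does (shapes here are made up): it takes a batched state dict and keeps only the final sample of every component, preserving a length-1 batch axis so the result can be passed straight to predict().

import numpy as np

# hypothetical batch of 10 states: an image observation plus a measurements vector
next_states = {
    'observation': np.zeros((10, 84, 84, 4)),
    'measurements': np.zeros((10, 3)),
}

# equivalent to last_sample(next_states)
last = {k: np.expand_dims(v[-1], 0) for k, v in next_states.items()}
assert last['observation'].shape == (1, 84, 84, 4)
assert last['measurements'].shape == (1, 3)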

View File

@@ -93,7 +93,7 @@ class Agent(object):
         self.running_reward = None
         self.training_iteration = 0
         self.current_episode = self.tp.current_episode = 0
-        self.curr_state = []
+        self.curr_state = {}
         self.current_episode_steps_counter = 0
         self.episode_running_info = {}
         self.last_episode_evaluation_ran = 0
@@ -194,7 +194,7 @@ class Agent(object):
         for signal in self.signals:
             signal.reset()
         self.total_reward_in_current_episode = 0
-        self.curr_state = []
+        self.curr_state = {}
         self.last_episode_images = []
         self.current_episode_steps_counter = 0
         self.episode_running_info = {}
@@ -289,23 +289,20 @@ class Agent(object):
         :param batch: An array of transitions
         :return: For each transition element, returns a numpy array of all the transitions in the batch
         """
-        current_observations = np.array([transition.state['observation'] for transition in batch])
-        next_observations = np.array([transition.next_state['observation'] for transition in batch])
+        current_states = {}
+        next_states = {}
+        current_states['observation'] = np.array([transition.state['observation'] for transition in batch])
+        next_states['observation'] = np.array([transition.next_state['observation'] for transition in batch])
         actions = np.array([transition.action for transition in batch])
         rewards = np.array([transition.reward for transition in batch])
         game_overs = np.array([transition.game_over for transition in batch])
         total_return = np.array([transition.total_return for transition in batch])
-        current_states = current_observations
-        next_states = next_observations
 
         # get the entire state including measurements if available
         if self.tp.agent.use_measurements:
-            current_measurements = np.array([transition.state['measurements'] for transition in batch])
-            next_measurements = np.array([transition.next_state['measurements'] for transition in batch])
-            current_states = [current_observations, current_measurements]
-            next_states = [next_observations, next_measurements]
+            current_states['measurements'] = np.array([transition.state['measurements'] for transition in batch])
+            next_states['measurements'] = np.array([transition.next_state['measurements'] for transition in batch])
 
         return current_states, next_states, actions, rewards, game_overs, total_return
@@ -353,12 +350,24 @@ class Agent(object):
         # get new action
         action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0}
-        is_first_transition_in_episode = (self.curr_state == [])
+        is_first_transition_in_episode = (self.curr_state == {})
         if is_first_transition_in_episode:
-            observation = self.preprocess_observation(self.env.observation)
-            observation = stack_observation([], observation, self.tp.env.observation_stack_size)
-            self.curr_state = {'observation': observation}
+            if not isinstance(self.env.state, dict):
+                raise ValueError((
+                    'expected state to be a dictionary, found {}'
+                ).format(type(self.env.state)))
+            state = self.env.state
+
+            # TODO: modify preprocess_observation to modify the entire state
+            # for now, only preprocess the observation
+            state['observation'] = self.preprocess_observation(state['observation'])
+
+            # TODO: provide option to stack more than just the observation
+            # TODO: this should probably be happening in an environment wrapper anyway
+            state['observation'] = stack_observation([], state['observation'], self.tp.env.observation_stack_size)
+
+            self.curr_state = state
+
+            # TODO: this should be handled in the environment
             if self.tp.agent.use_measurements:
                 self.curr_state['measurements'] = self.env.measurements
                 if self.tp.agent.use_accumulated_reward_as_measurement:
@@ -373,22 +382,25 @@ class Agent(object):
         if type(action) == np.ndarray:
             action = action.squeeze()
 
         result = self.env.step(action)
         shaped_reward = self.preprocess_reward(result['reward'])
         if 'action_intrinsic_reward' in action_info.keys():
             shaped_reward += action_info['action_intrinsic_reward']
+        # TODO: should total_reward_in_current_episode include shaped_reward?
         self.total_reward_in_current_episode += result['reward']
-        observation = self.preprocess_observation(result['observation'])
+        next_state = result['state']
+        next_state['observation'] = self.preprocess_observation(next_state['observation'])
 
         # plot action values online
         if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
             self.plot_action_values_online()
 
         # initialize the next state
-        observation = stack_observation(self.curr_state['observation'], observation, self.tp.env.observation_stack_size)
-        next_state = {'observation': observation}
+        # TODO: provide option to stack more than just the observation
+        next_state['observation'] = stack_observation(self.curr_state['observation'], next_state['observation'], self.tp.env.observation_stack_size)
 
         if self.tp.agent.use_measurements and 'measurements' in result.keys():
-            next_state['measurements'] = result['measurements']
+            next_state['measurements'] = result['state']['measurements']
             if self.tp.agent.use_accumulated_reward_as_measurement:
                 next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
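
For reference, a minimal sketch of the batch layout extract_batch now produces, using an invented Transition stand-in: a dict of stacked arrays keyed by state component instead of a bare observation array or a positional [observations, measurements] list.

import numpy as np
from collections import namedtuple

# hypothetical stand-in for the real Transition class
Transition = namedtuple('Transition', 'state next_state action reward game_over total_return')

batch = [
    Transition(state={'observation': np.zeros((84, 84)), 'measurements': np.zeros(3)},
               next_state={'observation': np.zeros((84, 84)), 'measurements': np.zeros(3)},
               action=0, reward=1.0, game_over=False, total_return=1.0)
    for _ in range(32)
]

current_states = {'observation': np.array([t.state['observation'] for t in batch]),
                  'measurements': np.array([t.state['measurements'] for t in batch])}
assert current_states['observation'].shape == (32, 84, 84)
assert current_states['measurements'].shape == (32, 3)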

View File

@@ -1,5 +1,5 @@
 #
 # Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

View File

@@ -48,7 +48,7 @@ class TensorFlowArchitecture(Architecture):
         self.network_is_local = network_is_local
         assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent'
         self.sess = tuning_parameters.sess
-        self.inputs = []
+        self.inputs = {}
         self.outputs = []
         self.targets = []
         self.losses = []
@@ -106,7 +106,8 @@ class TensorFlowArchitecture(Architecture):
         # gradients of the outputs w.r.t. the inputs
         # at the moment, this is only used by ddpg
         if len(self.outputs) == 1:
-            self.gradients_wrt_inputs = [tf.gradients(self.outputs[0], input_ph) for input_ph in self.inputs]
+            # TODO: convert gradients_with_respect_to_inputs into dictionary?
+            self.gradients_wrt_inputs = [tf.gradients(self.outputs[0], input_ph) for input_ph in self.inputs.values()]
             self.gradients_weights_ph = tf.placeholder('float32', self.outputs[0].shape, 'output_gradient_weights')
             self.weighted_gradients = tf.gradients(self.outputs[0], self.trainable_weights, self.gradients_weights_ph)
@@ -169,9 +170,8 @@ class TensorFlowArchitecture(Architecture):
         # feed inputs
         if additional_fetches is None:
             additional_fetches = []
-        inputs = force_list(inputs)
-        feed_dict = dict(zip(self.inputs, inputs))
+        feed_dict = self._feed_dict(inputs)
 
         # feed targets
         targets = force_list(targets)
@@ -266,6 +266,12 @@ class TensorFlowArchitecture(Architecture):
         while self.tp.sess.run(self.release_counter) % self.tp.num_threads != 0:
             time.sleep(0.00001)
 
+    def _feed_dict(self, inputs):
+        return {
+            self.inputs[input_name]: input_value
+            for input_name, input_value in inputs.items()
+        }
+
     def predict(self, inputs, outputs=None):
         """
         Run a forward pass of the network using the given input
@@ -275,8 +281,8 @@ class TensorFlowArchitecture(Architecture):
         WARNING: must only call once per state since each call is assumed by LSTM to be a new time step.
         """
-        feed_dict = dict(zip(self.inputs, force_list(inputs)))
+        # TODO: rename self.inputs -> self.input_placeholders
+        feed_dict = self._feed_dict(inputs)
         if outputs is None:
             outputs = self.outputs
@@ -290,21 +296,21 @@ class TensorFlowArchitecture(Architecture):
         return squeeze_list(output)
 
-    def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None):
-        """
-        Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
-        :param additional_fetches: Optional tensors to fetch during the training process
-        :param inputs: The input for the network
-        :param targets: The targets corresponding to the input batch
-        :param scaler: A scaling factor that allows rescaling the gradients before applying them
-        :return: The loss of the network
-        """
-        if additional_fetches is None:
-            additional_fetches = []
-        force_list(additional_fetches)
-        loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches)
-        self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
-        return loss
+    # def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None):
+    #     """
+    #     Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
+    #     :param additional_fetches: Optional tensors to fetch during the training process
+    #     :param inputs: The input for the network
+    #     :param targets: The targets corresponding to the input batch
+    #     :param scaler: A scaling factor that allows rescaling the gradients before applying them
+    #     :return: The loss of the network
+    #     """
+    #     if additional_fetches is None:
+    #         additional_fetches = []
+    #     force_list(additional_fetches)
+    #     loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches)
+    #     self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
+    #     return loss
 
     def get_weights(self):
         """

View File

@@ -112,7 +112,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
         ####################
 
         state_embedding = []
-        for idx, input_type in enumerate(self.tp.agent.input_types):
+        for input_name, input_type in self.tp.agent.input_types.items():
             # get the class of the input embedder
             input_embedder = self.get_input_embedder(input_type)
             self.input_embedders.append(input_embedder)
@@ -122,9 +122,9 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
             # the existing input_placeholders into the input_embedders.
             if network_idx == 0:
                 input_placeholder, embedding = input_embedder()
-                self.inputs.append(input_placeholder)
+                self.inputs[input_name] = input_placeholder
             else:
-                input_placeholder, embedding = input_embedder(self.inputs[idx])
+                input_placeholder, embedding = input_embedder(self.inputs[input_name])
             state_embedding.append(embedding)
@@ -159,13 +159,15 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
             # build the head
             if self.network_is_local:
-                output, target_placeholder, input_placeholder = self.output_heads[-1](head_input)
+                output, target_placeholder, input_placeholders = self.output_heads[-1](head_input)
                 self.targets.extend(target_placeholder)
             else:
-                output, input_placeholder = self.output_heads[-1](head_input)
+                output, input_placeholders = self.output_heads[-1](head_input)
 
             self.outputs.extend(output)
-            self.inputs.extend(input_placeholder)
+            # TODO: use head names as well
+            for placeholder_index, input_placeholder in enumerate(input_placeholders):
+                self.inputs['output_{}_{}'.format(head_idx, placeholder_index)] = input_placeholder
 
         # Losses
         self.losses = tf.losses.get_losses(self.name)
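
The 'output_{head_idx}_{placeholder_index}' keys defined here are what agents now use to feed head-specific inputs; the actor-critic change above relies on this when it sets inputs['output_1_0'] = actions for the first extra placeholder of head index 1 (the policy head). A tiny sketch of the naming scheme with invented placeholder objects:

head_idx = 1
input_placeholders = ['actions_placeholder']  # hypothetical extra inputs returned by a head

inputs = {}
for placeholder_index, input_placeholder in enumerate(input_placeholders):
    inputs['output_{}_{}'.format(head_idx, placeholder_index)] = input_placeholder

assert 'output_1_0' in inputs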

View File

@@ -250,7 +250,7 @@ class MeasurementsPredictionHead(Head):
                                             name='output')
             action_stream = tf.reshape(action_stream,
                                        (tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
-            action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keep_dims=True)
+            action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
 
             # merge to future measurements predictions
             self.output = tf.add(expectation_stream, action_stream, name='output')
@@ -302,7 +302,7 @@ class DNDQHead(Head):
         square_diff = tf.square(dnd_embeddings - tf.expand_dims(input_layer, 1))
         distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
         weights = 1.0 / distances
-        normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
+        normalised_weights = weights / tf.reduce_sum(weights, axis=1, keepdims=True)
         return tf.reduce_sum(dnd_values * normalised_weights, axis=1)

View File

@@ -30,6 +30,14 @@ from subprocess import Popen
 import datetime
 import presets
 import atexit
+import sys
+import subprocess
+from threading import Thread
+
+try:
+    from Queue import Queue, Empty
+except ImportError:
+    from queue import Queue, Empty  # for Python 3.x
 
 if len(set(failed_imports)) > 0:
     screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
@@ -152,6 +160,38 @@ def run_dict_to_json(_run_dict, task_id=''):
     return json_path
 
+
+def enqueue_output(out, queue):
+    for line in iter(out.readline, b''):
+        queue.put(line)
+    out.close()
+
+
+def merge_streams(processes, output_stream=sys.stdout):
+    q = Queue()
+
+    threads = []
+    for p in processes:
+        threads.append(Thread(target=enqueue_output, args=(p.stdout, q)))
+        threads.append(Thread(target=enqueue_output, args=(p.stderr, q)))
+
+    for t in threads:
+        t.daemon = True
+        t.start()
+
+    while True:
+        try:
+            line = q.get_nowait()
+        except Empty:
+            # break when all processes are done and q is empty
+            if all(p.poll() is not None for p in processes):
+                break
+        else:
+            # sys.stdout.write(line)
+            output_stream.write(line.decode(output_stream.encoding))
+            output_stream.flush()
+
+    print('All processes done')
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('-p', '--preset',
@@ -252,6 +292,8 @@ if __name__ == "__main__":
     if not args.no_summary:
         atexit.register(logger.print_summary)
 
+    set_cpu()
+
     # Single-threaded runs
     if run_dict['num_threads'] == 1:
         # set tuning parameters
@@ -285,11 +327,13 @@ if __name__ == "__main__":
         set_cpu()
 
         # create a parameter server
-        Popen(["python3",
-               "./parallel_actor.py",
-               "--ps_hosts={}".format(ps_hosts),
-               "--worker_hosts={}".format(worker_hosts),
-               "--job_name=ps"])
+        parameter_server = Popen([
+            "python3",
+            "./parallel_actor.py",
+            "--ps_hosts={}".format(ps_hosts),
+            "--worker_hosts={}".format(worker_hosts),
+            "--job_name=ps",
+        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
 
         screen.log_title("*** Distributed Training ***")
         time.sleep(1)
@@ -314,13 +358,15 @@ if __name__ == "__main__":
                             "--job_name=worker",
                             "--load_json={}".format(json_run_dict_path)]
 
-            p = Popen(workers_args)
+            p = Popen(workers_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
 
             if i != run_dict['num_threads']:
                 workers.append(p)
             else:
                 evaluation_worker = p
 
+        merge_streams(workers + [parameter_server])
+
         # wait for all workers
         [w.wait() for w in workers]
         evaluation_worker.kill()
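
A standalone sketch of how merge_streams is meant to be used (the child commands are placeholders): start the children with piped stdout/stderr, exactly as the Popen calls above do, and let merge_streams interleave their output onto the parent's stdout until every process has exited.

import subprocess
import sys

# two placeholder child processes standing in for the parameter server and the workers
processes = [
    subprocess.Popen(['python3', '-c', 'print("worker 0")'],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1),
    subprocess.Popen(['python3', '-c', 'print("worker 1")'],
                     stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1),
]

# merge_streams is the helper added above in coach.py
merge_streams(processes, output_stream=sys.stdout)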

View File

@@ -69,7 +69,7 @@ class Parameters(object):
                 parameters[k] = dict(v.items())
             else:
                 parameters[k] = v
         return json.dumps(parameters, indent=4, default=repr)
@@ -77,7 +77,7 @@ class AgentParameters(Parameters):
     agent = ''
 
     # Architecture parameters
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     middleware_type = MiddlewareTypes.FC
     loss_weights = [1.0]
@@ -327,7 +327,7 @@ class Human(AgentParameters):
 class NStepQ(AgentParameters):
     type = 'NStepQAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     optimizer_type = 'Adam'
@@ -343,7 +343,7 @@ class NStepQ(AgentParameters):
 class DQN(AgentParameters):
     type = 'DQNAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     optimizer_type = 'Adam'
@@ -385,7 +385,7 @@ class QuantileRegressionDQN(DQN):
 class NEC(AgentParameters):
     type = 'NECAgent'
     optimizer_type = 'RMSProp'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.DNDQ]
     loss_weights = [1.0]
     dnd_size = 500000
@@ -399,7 +399,7 @@ class NEC(AgentParameters):
 class ActorCritic(AgentParameters):
     type = 'ActorCriticAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V, OutputTypes.Pi]
     loss_weights = [0.5, 1.0]
     stop_gradients_from_head = [False, False]
@@ -417,7 +417,7 @@ class ActorCritic(AgentParameters):
 class PolicyGradient(AgentParameters):
     type = 'PolicyGradientsAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Pi]
     loss_weights = [1.0]
     num_episodes_in_experience_replay = 2
@@ -430,7 +430,7 @@ class PolicyGradient(AgentParameters):
 class DDPG(AgentParameters):
     type = 'DDPGAgent'
-    input_types = [InputTypes.Observation, InputTypes.Action]
+    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
     output_types = [OutputTypes.V]  # V is used because we only want a single Q value
     loss_weights = [1.0]
     hidden_layers_activation_function = 'relu'
@@ -443,7 +443,7 @@ class DDPG(AgentParameters):
 class DDDPG(AgentParameters):
     type = 'DDPGAgent'
-    input_types = [InputTypes.Observation, InputTypes.Action]
+    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
     output_types = [OutputTypes.V]  # V is used because we only want a single Q value
     loss_weights = [1.0]
     hidden_layers_activation_function = 'relu'
@@ -456,7 +456,7 @@ class DDDPG(AgentParameters):
 class NAF(AgentParameters):
     type = 'NAFAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.NAF]
     loss_weights = [1.0]
     hidden_layers_activation_function = 'tanh'
@@ -469,7 +469,7 @@ class NAF(AgentParameters):
 class PPO(AgentParameters):
     type = 'PPOAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V]
     loss_weights = [1.0]
     hidden_layers_activation_function = 'tanh'
@@ -489,7 +489,7 @@ class PPO(AgentParameters):
 class ClippedPPO(AgentParameters):
     type = 'ClippedPPOAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.V, OutputTypes.PPO]
     loss_weights = [0.5, 1.0]
     stop_gradients_from_head = [False, False]
@@ -515,7 +515,11 @@ class ClippedPPO(AgentParameters):
 class DFP(AgentParameters):
     type = 'DFPAgent'
-    input_types = [InputTypes.Observation, InputTypes.Measurements, InputTypes.GoalVector]
+    input_types = {
+        'observation': InputTypes.Observation,
+        'measurements': InputTypes.Measurements,
+        'goal': InputTypes.GoalVector
+    }
     output_types = [OutputTypes.MeasurementsPrediction]
     loss_weights = [1.0]
     use_measurements = True
@@ -527,7 +531,7 @@ class DFP(AgentParameters):
 class MMC(AgentParameters):
     type = 'MixedMonteCarloAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     num_steps_between_copying_online_weights_to_target = 1000
@@ -537,7 +541,7 @@ class MMC(AgentParameters):
 class PAL(AgentParameters):
     type = 'PALAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     pal_alpha = 0.9
@@ -548,7 +552,7 @@ class PAL(AgentParameters):
 class BC(AgentParameters):
     type = 'BCAgent'
-    input_types = [InputTypes.Observation]
+    input_types = {'observation': InputTypes.Observation}
     output_types = [OutputTypes.Q]
     loss_weights = [1.0]
     collect_new_data = False

View File

@@ -161,7 +161,6 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
         measurements = []
         while type(measurements) == list:
             measurements, sensor_data = self.game.read_data()
-        self.observation = sensor_data['CameraRGB'].data
 
         self.location = (measurements.player_measurements.transform.location.x,
                          measurements.player_measurements.transform.location.y,
@@ -181,7 +180,10 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
                        - np.abs(self.control.steer) * 10
 
         # update measurements
-        self.measurements = [measurements.player_measurements.forward_speed]
+        self.observation = {
+            'observation': sensor_data['CameraRGB'].data,
+            'measurements': [measurements.player_measurements.forward_speed],
+        }
         self.autopilot = measurements.player_measurements.autopilot_control
 
         # action_p = ['%.2f' % member for member in [self.control.throttle, self.control.steer]]

View File

@@ -1,5 +1,5 @@
 #
 # Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -135,8 +135,10 @@ class DoomEnvironmentWrapper(EnvironmentWrapper):
         # extract all data from the current state
         state = self.game.get_state()
         if state is not None and state.screen_buffer is not None:
-            self.observation = state.screen_buffer
-            self.measurements = state.game_variables
+            self.observation = {
+                'observation': state.screen_buffer,
+                'measurements': state.game_variables,
+            }
             self.reward = self.game.get_last_reward()
             self.done = self.game.is_episode_finished()
@@ -157,5 +159,3 @@ class DoomEnvironmentWrapper(EnvironmentWrapper):
     def _restart_environment_episode(self, force_environment_reset=False):
         self.game.new_episode()

View File

@@ -1,5 +1,5 @@
 #
 # Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -31,14 +31,13 @@ class EnvironmentWrapper(object):
         # env initialization
         self.game = []
         self.actions = {}
-        self.observation = []
+        self.state = []
         self.reward = 0
         self.done = False
         self.default_action = 0
         self.last_action_idx = 0
         self.episode_idx = 0
         self.last_episode_time = time.time()
-        self.measurements = []
         self.info = []
         self.action_space_low = 0
         self.action_space_high = 0
@@ -65,6 +64,22 @@ class EnvironmentWrapper(object):
         self.game_is_open = True
         self.renderer = Renderer()
 
+    @property
+    def measurements(self):
+        assert False
+
+    @measurements.setter
+    def measurements(self, value):
+        assert False
+
+    @property
+    def observation(self):
+        assert False
+
+    @observation.setter
+    def observation(self, value):
+        assert False
+
     def _idx_to_action(self, action_idx):
         """
         Convert an action index to one of the environment available actions.
@@ -108,7 +123,7 @@ class EnvironmentWrapper(object):
             for env_keys in self.key_to_action.keys():
                 if set(env_keys) == set(self.renderer.pressed_keys):
                     return self.key_to_action[env_keys]
         # return the default action 0 so that the environment will continue running
         return self.default_action
@@ -116,7 +131,7 @@ class EnvironmentWrapper(object):
         """
         Perform a single step on the environment using the given action
         :param action_idx: the action to perform on the environment
-        :return: A dictionary containing the observation, reward, done flag, action and measurements
+        :return: A dictionary containing the state, reward, done flag and action
        """
         self.last_action_idx = action_idx
@@ -127,13 +142,12 @@ class EnvironmentWrapper(object):
         if self.is_rendered:
             self.render()
 
-        self.observation = self._preprocess_observation(self.observation)
+        self.state = self._preprocess_state(self.state)
 
-        return {'observation': self.observation,
+        return {'state': self.state,
                 'reward': self.reward,
                 'done': self.done,
                 'action': self.last_action_idx,
-                'measurements': self.measurements,
                 'info': self.info}
 
     def render(self):
@@ -146,7 +160,7 @@ class EnvironmentWrapper(object):
         """
         Reset the environment and all the variable of the wrapper
         :param force_environment_reset: forces environment reset even when the game did not end
-        :return: A dictionary containing the observation, reward, done flag, action and measurements
+        :return: A dictionary containing the state, reward, done flag and action
         """
         self._restart_environment_episode(force_environment_reset)
         self.last_episode_time = time.time()
@@ -156,17 +170,18 @@ class EnvironmentWrapper(object):
         self.last_action_idx = 0
         self._update_state()
 
-        # render before the preprocessing of the observation, so that the image will be in its original quality
+        # render before the preprocessing of the state, so that the image will be in its original quality
         if self.is_rendered:
             self.render()
 
-        self.observation = self._preprocess_observation(self.observation)
+        # TODO BUG: if the environment has not been reset, _preprocessed_state will be running on an already preprocessed state
+        # TODO: see also _update_state above
+        self.state = self._preprocess_state(self.state)
 
-        return {'observation': self.observation,
+        return {'state': self.state,
                 'reward': self.reward,
                 'done': self.done,
                 'action': self.last_action_idx,
-                'measurements': self.measurements,
                 'info': self.info}
 
     def get_random_action(self):
@@ -181,7 +196,7 @@ class EnvironmentWrapper(object):
     def change_phase(self, phase):
         """
         Change the current phase of the run.
         This is useful when different behavior is expected when testing and training
         :param phase: The running phase of the algorithm
         :type phase: RunPhase
@@ -216,19 +231,19 @@ class EnvironmentWrapper(object):
         """
         pass
 
-    def _preprocess_observation(self, observation):
+    def _preprocess_state(self, state):
         """
-        Do initial observation preprocessing such as cropping, rgb2gray, rescale etc.
+        Do initial state preprocessing such as cropping, rgb2gray, rescale etc.
         Implementing this function is optional.
-        :param observation: a raw observation from the environment
-        :return: the preprocessed observation
+        :param state: a raw state from the environment
+        :return: the preprocessed state
         """
-        return observation
+        return state
 
     def _update_state(self):
         """
         Updates the state from the environment.
-        Should update self.observation, self.reward, self.done, self.measurements and self.info
+        Should update self.state, self.reward, self.done and self.info
         :return: None
         """
         pass
@@ -243,7 +258,8 @@ class EnvironmentWrapper(object):
     def get_rendered_image(self):
         """
         Return a numpy array containing the image that will be rendered to the screen.
-        This can be different from the observation. For example, mujoco's observation is a measurements vector.
+        This can be different from the state. For example, mujoco's state is a measurements vector.
         :return: numpy array containing the image that will be rendered to the screen
         """
-        return self.observation
+        # TODO: probably needs revisiting
+        return self.state
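
For clarity, a sketch (with placeholder values) of the step()/reset() contract the agent code above now assumes: the environment exposes a single state dict holding the observation plus any extra components such as measurements, instead of separate observation and measurements fields.

import numpy as np

# hypothetical return value of EnvironmentWrapper.step(action) after this change
result = {
    'state': {
        'observation': np.zeros((84, 84, 3)),  # pixels (or any other observation)
        'measurements': np.zeros(3),           # optional extra state components
    },
    'reward': 0.0,
    'done': False,
    'action': 0,
    'info': {},
}

next_state = result['state']
assert 'observation' in next_state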

View File

@@ -60,7 +60,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
            self.env.frameskip = self.frame_skip
 
         self.discrete_controls = type(self.env.action_space) != gym.spaces.box.Box
-        self.observation = self.reset(True)['observation']
+        self.state = self.reset(True)['state']
 
         # render
         if self.is_rendered:
@@ -70,12 +70,13 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             scale = 2
             self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
 
-        self.is_state_type_image = len(self.observation.shape) > 1
+        # TODO: collect and store this as observation space instead
+        self.is_state_type_image = len(self.state['observation'].shape) > 1
         if self.is_state_type_image:
-            self.width = self.observation.shape[1]
-            self.height = self.observation.shape[0]
+            self.width = self.state['observation'].shape[1]
+            self.height = self.state['observation'].shape[0]
         else:
-            self.width = self.observation.shape[0]
+            self.width = self.state['observation'].shape[0]
 
         # action space
         self.actions_description = {}
@@ -101,6 +102,12 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             self.timestep_limit = None
         self.measurements_size = len(self.step(0)['info'].keys())
 
+    def _wrap_state(self, state):
+        if isinstance(self.env.observation_space, gym.spaces.Dict):
+            return state
+        else:
+            return {'observation': state}
+
     def _update_state(self):
         if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'):
             if self.phase == RunPhase.TRAIN and hasattr(self, 'current_ale_lives'):
@@ -131,28 +138,30 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             action = np.squeeze(action)
             action = np.clip(action, self.action_space_low, self.action_space_high)
 
-        self.observation, self.reward, self.done, self.info = self.env.step(action)
+        state, self.reward, self.done, self.info = self.env.step(action)
+        self.state = self._wrap_state(state)
 
-    def _preprocess_observation(self, observation):
+    def _preprocess_state(self, state):
+        # TODO: move this into wrapper
         if any(env in self.env_id for env in ["Breakout", "Pong"]):
             # crop image
-            observation = observation[34:195, :, :]
-        return observation
+            state['observation'] = state['observation'][34:195, :, :]
+        return state
 
     def _restart_environment_episode(self, force_environment_reset=False):
         # prevent reset of environment if there are ale lives left
         if (hasattr(self.env, 'env') and hasattr(self.env.env, 'ale') and self.env.env.ale.lives() > 0) \
                 and not force_environment_reset and not self.env._past_limit():
-            return self.observation
+            return self.state
 
         if self.seed:
             self.env.seed(self.seed)
-        self.observation = self.env.reset()
+        self.state = self._wrap_state(self.env.reset())
 
-        while self.observation is None:
+        while self.state is None:
             self.step(0)
 
-        return self.observation
+        return self.state
 
     def get_rendered_image(self):
         return self.env.render(mode='rgb_array')
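
A small illustration of what _wrap_state gives you for a plain Box-observation gym environment (CartPole is just an arbitrary example, and a gym version where reset() returns only the observation is assumed): the raw array coming back from gym gets wrapped into the state-dict form the rest of the stack expects, while Dict observation spaces are passed through unchanged.

import gym
import numpy as np

env = gym.make('CartPole-v0')
raw = env.reset()  # a plain np.ndarray for a Box observation space
if isinstance(env.observation_space, gym.spaces.Dict):
    state = raw                    # already a dict of components
else:
    state = {'observation': raw}   # wrap into the common state-dict form
assert isinstance(state['observation'], np.ndarray)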

View File

@@ -1,5 +1,5 @@
 #
 # Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -250,13 +250,13 @@ class Logger(BaseLogger):
         if 'Training Reward' in self.data.keys() and 'Evaluation Reward' in self.data.keys():
             screen.log_title("Max training reward: {}, max evaluation reward: {}".format(self.data['Training Reward'].max(), self.data['Evaluation Reward'].max()))
         screen.separator()
-        if screen.ask_yes_no("Do you want to discard the experiment results (Warning: this cannot be undone)?", False):
-            self.remove_experiment_dir()
-        elif screen.ask_yes_no("Do you want to specify a different experiment name to save to?", False):
-            new_name = self.get_experiment_name()
-            new_path = self.get_experiment_path(new_name, create_path=False)
-            shutil.move(self.experiments_path, new_path)
-            screen.log_title("Results moved to: {}".format(new_path))
+        # if screen.ask_yes_no("Do you want to discard the experiment results (Warning: this cannot be undone)?", False):
+        #     self.remove_experiment_dir()
+        # elif screen.ask_yes_no("Do you want to specify a different experiment name to save to?", False):
+        #     new_name = self.get_experiment_name()
+        #     new_path = self.get_experiment_path(new_name, create_path=False)
+        #     shutil.move(self.experiments_path, new_path)
+        #     screen.log_title("Results moved to: {}".format(new_path))
 
     def get_experiment_name(self, initial_experiment_name=''):
         match = None

View File

@@ -109,6 +109,7 @@ if __name__ == '__main__':
             num_workers=preset.test_num_workers,
             log_file_name=log_file_name,
         )
+        print('cmd', cmd)
         p = subprocess.Popen(cmd, shell=True, executable="/bin/bash", preexec_fn=os.setsid)
 
         # get the csv with the results