mirror of https://github.com/gryf/coach.git synced 2025-12-18 11:40:18 +01:00

pre-release 0.10.0

This commit is contained in:
Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions


@@ -0,0 +1,664 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from typing import List
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import force_list, squeeze_list
from rl_coach.architectures.architecture import Architecture
from rl_coach.core_types import GradientClippingMethod
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
layers = [input_layer]
# batchnorm
if batchnorm:
layers.append(
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
)
# activation
if activation_function:
layers.append(
activation_function(layers[-1], name="activation{}".format(layer_idx))
)
# dropout
if dropout:
layers.append(
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
)
# remove the input layer from the layers list
del layers[0]
return layers
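# A brief usage sketch (hypothetical tensor `x`; values chosen only for illustration): the helper chains
# batchnorm -> activation -> dropout, each of them optional, on top of input_layer and returns only the
# newly created layers:
#   layers = batchnorm_activation_dropout(x, batchnorm=True, activation_function=tf.nn.relu,
#                                         dropout=True, dropout_rate=0.3, layer_idx=0)
#   top = layers[-1]  # output of the last op that was added (here, the dropout layer)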
class Conv2d(object):
def __init__(self, params: List):
"""
:param params: list of [num_filters, kernel_size, strides]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow conv2d layer
:param input_layer: previous layer
:param name: layer name
:return: conv2d layer
"""
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
data_format='channels_last', name=name)
class Dense(object):
def __init__(self, params: List):
"""
:param params: list of [num_output_neurons]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow dense layer
:param input_layer: previous layer
:param name: layer name
:return: dense layer
"""
return tf.layers.dense(input_layer, self.params[0], name=name)
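# A brief usage sketch for the two layer wrappers above (hypothetical tensors and names): the layer parameters
# are fixed at construction time, and the actual tf.layers op is only built when the wrapper is called:
#   conv = Conv2d([32, 8, 4])                        # 32 filters, 8x8 kernel, stride 4
#   conv_out = conv(input_layer=obs, name='conv0')
#   fc = Dense([256])                                # 256 output units
#   fc_out = fc(input_layer=flat_obs, name='fc0')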
def variable_summaries(var):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope('summaries'):
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
with tf.name_scope(layer_weight_name):
mean = tf.reduce_mean(var)
tf.summary.scalar('mean', mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
tf.summary.scalar('stddev', stddev)
tf.summary.scalar('max', tf.reduce_max(var))
tf.summary.scalar('min', tf.reduce_min(var))
tf.summary.histogram('histogram', var)
def local_getter(getter, name, *args, **kwargs):
"""
This is a wrapper around the tf.get_variable getter, which places variables in the local variables collection
instead of the global variables collection. The local variables collection holds variables that are not shared
between workers. These variables are also assumed to be non-trainable (the optimizer does not apply gradients to
them), but gradients with respect to them can still be calculated, and their content can still be updated.
"""
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
return getter(name, *args, **kwargs)
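# Note: this getter takes effect when it is passed as the custom_getter of a tf.variable_scope (as done in
# TensorFlowArchitecture below); every tf.get_variable call inside that scope is then routed through it, so the
# created variables end up in tf.GraphKeys.LOCAL_VARIABLES rather than the global variables collection.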
class TensorFlowArchitecture(Architecture):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: whether the network is local (dedicated to the worker) or global (shared between workers)
:param network_is_trainable: whether the network is trainable (i.e. gradients can be applied to it)
"""
super().__init__(agent_parameters, spaces, name)
self.middleware = None
self.network_is_local = network_is_local
self.global_network = global_network
if not self.network_parameters.tensorflow_support:
raise ValueError('TensorFlow is not supported for this agent')
self.sess = None
self.inputs = {}
self.outputs = []
self.targets = []
self.importance_weights = []
self.losses = []
self.total_loss = None
self.trainable_weights = []
self.weights_placeholders = []
self.shared_accumulated_gradients = []
self.curr_rnn_c_in = None
self.curr_rnn_h_in = None
self.gradients_wrt_inputs = []
self.train_writer = None
self.accumulated_gradients = None
self.network_is_trainable = network_is_trainable
self.is_chief = self.ap.task_parameters.task_index == 0
self.network_is_global = not self.network_is_local and global_network is None
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
self.optimizer_type = self.network_parameters.optimizer_type
if self.ap.task_parameters.seed is not None:
tf.set_random_seed(self.ap.task_parameters.seed)
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
custom_getter=local_getter if network_is_local and global_network else None):
self.global_step = tf.train.get_or_create_global_step()
# build the network
self.get_model()
# model weights
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
# create the placeholder for the assigning gradients and some tensorboard summaries for the weights
for idx, var in enumerate(self.weights):
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
self.weights_placeholders.append(placeholder)
if self.ap.visualization.tensorboard:
variable_summaries(var)
# create op for assigning a list of weights to the network weights
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
zip(self.weights_placeholders, self.weights)]
# locks for synchronous training
if self.network_is_global:
self._create_locks_for_synchronous_training()
# gradients ops
self._create_gradient_ops()
# L2 regularization
if self.network_parameters.l2_regularization != 0:
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
* self.network_parameters.l2_regularization]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
self.inc_step = self.global_step.assign_add(1)
# reset LSTM hidden cells
self.reset_internal_memory()
if self.ap.visualization.tensorboard:
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
scope=tf.contrib.framework.get_name_scope())
self.merged = tf.summary.merge(current_scope_summaries)
# initialize or restore model
self.init_op = tf.group(
tf.global_variables_initializer(),
tf.local_variables_initializer()
)
# set the fetches for training
self._set_initial_fetch_list()
def _set_initial_fetch_list(self):
"""
Create an initial list of tensors to fetch in each training iteration
:return: None
"""
self.train_fetches = [self.gradients_norm]
if self.network_parameters.clip_gradients:
self.train_fetches.append(self.clipped_grads)
else:
self.train_fetches.append(self.tensor_gradients)
self.train_fetches += [self.total_loss, self.losses]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.train_fetches.append(self.middleware.state_out)
self.additional_fetches_start_idx = len(self.train_fetches)
def _create_locks_for_synchronous_training(self):
"""
Create locks for synchronizing the different workers during training
:return: None
"""
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.lock = self.lock_counter.assign_add(1, use_locking=True)
self.lock_init = self.lock_counter.assign(0)
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.release = self.release_counter.assign_add(1, use_locking=True)
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
self.release_init = self.release_counter.assign(0)
def _create_gradient_ops(self):
"""
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
:return: None
"""
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
self.gradients_norm = tf.global_norm(self.tensor_gradients)
# gradient clipping
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
self._create_gradient_clipping_ops()
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
# applying them
if self.distributed_training:
self._create_gradient_accumulators()
# gradients of the outputs w.r.t. the inputs
# at the moment, this is only used by ddpg
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
self.inputs.items()} for output in self.outputs]
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
for i in range(len(self.outputs))]
self.weighted_gradients = []
for i in range(len(self.outputs)):
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
# unnormalized gradients seem to work better for now. TODO: validate this across more environments
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
# unnormalized_gradients)))
self.weighted_gradients.append(unnormalized_gradients)
# defining the optimization process (for LBFGS we have less control over the optimizer)
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
self._create_gradient_applying_ops()
def _create_gradient_accumulators(self):
if self.network_is_global:
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
self.shared_accumulated_gradients]
elif self.network_is_local:
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
def _create_gradient_clipping_ops(self):
"""
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
:return: None
"""
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
self.network_parameters.clip_gradients)
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
self.clipped_grads = [tf.clip_by_value(grad,
-self.network_parameters.clip_gradients,
self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
def _create_gradient_applying_ops(self):
"""
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
(distributed training - local or global network, shared optimizer, etc.)
:return: None
"""
if self.network_is_global and self.network_parameters.shared_optimizer and \
not self.network_parameters.async_training:
# synchronous training with shared optimizer? -> create an operation for applying the gradients
# accumulated in the shared gradients accumulator
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
zip(self.shared_accumulated_gradients, self.weights),
global_step=self.global_step)
elif self.distributed_training and self.network_is_local:
# distributed training but independent optimizer? -> create an operation for applying the gradients
# to the global weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
elif self.network_is_trainable:
# not any of the above but is trainable? -> create an operation for applying the gradients to
# this network weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
def set_session(self, sess):
self.sess = sess
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
# initialize the session parameters in single-threaded runs. Otherwise, this is done through the
# MonitoredSession object in the graph manager
if not task_is_distributed:
self.sess.run(self.init_op)
if self.ap.visualization.tensorboard:
# Write the merged summaries to the current experiment directory
if not task_is_distributed:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
self.train_writer.add_graph(self.sess.graph)
elif self.network_is_local:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
self.train_writer.add_graph(self.sess.graph)
# wait for all the workers to set their session
if not self.network_is_local:
self.wait_for_all_workers_barrier()
def reset_accumulated_gradients(self):
"""
Reset the gradients accumulation placeholder
"""
if self.accumulated_gradients is None:
self.accumulated_gradients = self.sess.run(self.weights)
for ix, grad in enumerate(self.accumulated_gradients):
self.accumulated_gradients[ix] = grad * 0
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
no_accumulation=False):
"""
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
placeholders
:param additional_fetches: Optional tensors to fetch during gradients calculation
:param inputs: The input batch for the network
:param targets: The targets corresponding to the input batch
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
of that sample. If it is not given, the sample losses won't be scaled
:param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
replaced by the newly calculated gradients, instead of accumulating the new gradients into them.
This can speed up the function runtime by around 10%.
:return: A list containing the total loss and the individual network heads losses
"""
if self.accumulated_gradients is None:
self.reset_accumulated_gradients()
# feed inputs
if additional_fetches is None:
additional_fetches = []
feed_dict = self.create_feed_dict(inputs)
# feed targets
targets = force_list(targets)
for placeholder_idx, target in enumerate(targets):
feed_dict[self.targets[placeholder_idx]] = target
# feed importance weights
importance_weights = force_list(importance_weights)
for placeholder_idx, target_ph in enumerate(targets):
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
importance_weight = np.ones(target_ph.shape[0])
else:
importance_weight = importance_weights[placeholder_idx]
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
if self.optimizer_type != 'LBFGS':
# feed the lstm state if necessary
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
# we can't always assume that we are starting from scratch here can we?
feed_dict[self.middleware.c_in] = self.middleware.c_init
feed_dict[self.middleware.h_in] = self.middleware.h_init
fetches = self.train_fetches + additional_fetches
if self.ap.visualization.tensorboard:
fetches += [self.merged]
# get grads
result = self.sess.run(fetches, feed_dict=feed_dict)
if hasattr(self, 'train_writer') and self.train_writer is not None:
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
# extract the fetches
norm_unclipped_grads, grads, total_loss, losses = result[:4]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
fetched_tensors = []
if len(additional_fetches) > 0:
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
len(additional_fetches)]
# accumulate the gradients
for idx, grad in enumerate(grads):
if no_accumulation:
self.accumulated_gradients[idx] = grad
else:
self.accumulated_gradients[idx] += grad
return total_loss, losses, norm_unclipped_grads, fetched_tensors
else:
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
return [0]
def create_feed_dict(self, inputs):
feed_dict = {}
for input_name, input_value in inputs.items():
if isinstance(input_name, str):
if input_name not in self.inputs:
raise ValueError((
'input name {input_name} was provided to create a feed '
'dictionary, but there is no placeholder with that name. '
'placeholder names available include: {placeholder_names}'
).format(
input_name=input_name,
placeholder_names=', '.join(self.inputs.keys())
))
feed_dict[self.inputs[input_name]] = input_value
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
feed_dict[input_name] = input_value
else:
raise ValueError((
'input dictionary expects strings or placeholders as keys, '
'but found key {key} of type {type}'
).format(
key=input_name,
type=type(input_name),
))
return feed_dict
def apply_and_reset_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights and resets the accumulation placeholder
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
self.apply_gradients(gradients, scaler)
self.reset_accumulated_gradients()
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
"""
Waits for all the workers to lock a certain lock and then continues
:param lock: the name of the lock to use
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
if include_only_training_workers:
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
else:
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
# lock
if hasattr(self, '{}_counter'.format(lock)):
self.sess.run(getattr(self, lock))
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
time.sleep(0.00001)
# self.sess.run(getattr(self, '{}_init'.format(lock)))
else:
raise ValueError("no counter was defined for the lock {}".format(lock))
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
"""
A barrier that allows waiting for all the workers to finish a certain block of commands
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
self.sess.run(self.lock_init)
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers continues
# and manages to increase the first lock again by one, only to have a late worker reset it again.
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
self.sess.run(self.release_init)
def apply_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
The gradients will be MULTIPLIED by this factor
"""
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
if hasattr(self, 'global_step') and not self.network_is_local:
self.sess.run(self.inc_step)
if self.optimizer_type != 'LBFGS':
if self.distributed_training and not self.network_parameters.async_training:
# rescale the gradients so that they average out with the gradients from the other workers
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
scaler /= float(self.ap.task_parameters.num_training_tasks)
# rescale the gradients
if scaler != 1.:
for gradient in gradients:
gradient *= scaler
# apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
if self.distributed_training and self.network_parameters.shared_optimizer \
and not self.network_parameters.async_training:
# synchronous distributed training with shared optimizer:
# - each worker adds its gradients to the shared gradients accumulators
# - we wait for all the workers to add their gradients
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
self.wait_for_all_workers_barrier(include_only_training_workers=True)
if self.is_chief:
self.sess.run(self.update_weights_from_shared_gradients)
self.sess.run(self.init_shared_accumulated_gradients)
else:
# async distributed training / distributed training with independent optimizer
# / non-distributed training - just apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
# release barrier
if self.distributed_training and not self.network_parameters.async_training:
self.wait_for_all_workers_barrier(include_only_training_workers=True)
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
"""
Run a forward pass of the network using the given input
:param inputs: The input for the network
:param outputs: The output for the network, defaults to self.outputs
:param squeeze_output: call squeeze_list on output
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
:return: The network output
WARNING: this must be called only once per state, since the LSTM treats each call as a new time step.
"""
feed_dict = self.create_feed_dict(inputs)
if initial_feed_dict:
feed_dict.update(initial_feed_dict)
if outputs is None:
outputs = self.outputs
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
feed_dict=feed_dict)
else:
output = self.sess.run(outputs, feed_dict)
if squeeze_output:
output = squeeze_list(output)
return output
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
"""
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
:param additional_fetches: Optional tensors to fetch during the training process
:param inputs: The input for the network
:param targets: The targets corresponding to the input batch
:param scaler: A scaling factor that allows rescaling the gradients before applying them
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
of that sample. If it is not given, the sample losses won't be scaled
:return: The loss of the network
"""
if additional_fetches is None:
additional_fetches = []
additional_fetches = force_list(additional_fetches)
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
importance_weights=importance_weights)
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
return loss
def get_weights(self):
"""
:return: a list of tensors containing the network weights for each layer
"""
return self.weights
def set_weights(self, weights, new_rate=1.0):
"""
Sets the network weights from the given list of weights tensors
"""
feed_dict = {}
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
for placeholder_idx, new_weight in enumerate(new_weights):
feed_dict[self.weights_placeholders[placeholder_idx]]\
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
self.sess.run(self.update_weights_from_list, feed_dict)
def get_variable_value(self, variable):
"""
Get the value of a variable from the graph
:param variable: the variable
:return: the value of the variable
"""
return self.sess.run(variable)
def set_variable_value(self, assign_op, value, placeholder=None):
"""
Updates the value of a variable.
This requires having an assign operation for the variable, and a placeholder which will provide the value
:param assign_op: an assign operation for the variable
:param value: a value to set the variable to
:param placeholder: a placeholder to hold the given value for injecting it into the variable
"""
self.sess.run(assign_op, feed_dict={placeholder: value})
def reset_internal_memory(self):
"""
Reset any internal memory used by the network. For example, an LSTM internal state
:return: None
"""
# initialize LSTM hidden states
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.curr_rnn_c_in = self.middleware.c_init
self.curr_rnn_h_in = self.middleware.h_init
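# A rough sketch of the single-worker training flow composed from the methods above, assuming `net` is a
# constructed TensorFlowArchitecture whose session was already set via set_session():
#   total_loss, losses, grad_norm, fetched = net.accumulate_gradients(inputs, targets)
#   net.apply_and_reset_gradients(net.accumulated_gradients)
# which is exactly what train_on_batch() wraps into a single call.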


@@ -0,0 +1,102 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Tuple
import tensorflow as tf
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
"""
Creates a ClusterSpec object representing the cluster.
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
:return: a ClusterSpec object representing the cluster
"""
# extract the parameter servers and workers from the given strings
ps_hosts = parameters_server.split(",")
worker_hosts = workers.split(",")
# Create a cluster spec from the parameter server and worker hosts
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
return cluster_spec
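# Usage sketch (hypothetical addresses):
#   cluster = create_cluster_spec("localhost:2222", "localhost:2223,localhost:2224")
#   # -> ClusterSpec({"ps": ["localhost:2222"], "worker": ["localhost:2223", "localhost:2224"]})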
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
"""
Create and start a parameter server
:param cluster_spec: the ClusterSpec object representing the cluster
:param config: the tensorflow config to use
:return: None
"""
# create a server object for the parameter server
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
# wait for the server to finish
server.join()
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
"""
Creates a worker server and a device setter used to assign the workers operations to
:param cluster_spec: a ClusterSpec object representing the cluster
:param task_index: the index of the worker task
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
:param config: the tensorflow config to use
:return: the target string for the tf.Session and the worker device setter object
"""
# Create and start a worker
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
# Assign ops to the local worker
worker_device = "/job:worker/task:{}".format(task_index)
if use_cpu:
worker_device += "/cpu:0"
else:
worker_device += "/device:GPU:0"
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
return server.target, device
def create_monitored_session(target: tf.train.Server, task_index: int,
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
"""
Create a monitored session for the worker
:param target: the target string for the tf.Session
:param task_index: the task index of the worker
:param checkpoint_dir: a directory path where the checkpoints will be stored
:param save_checkpoint_secs: the number of seconds between checkpoint saves
:param config: the tensorflow configuration (optional)
:return: the session to use for the run
"""
# we chose the first task to be the chief
is_chief = task_index == 0
# Create the monitored session
sess = tf.train.MonitoredTrainingSession(
master=target,
is_chief=is_chief,
hooks=[],
checkpoint_dir=checkpoint_dir,
save_checkpoint_secs=save_checkpoint_secs,
config=config
)
return sess
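# A rough sketch of how the helpers above fit together in a single worker process (hypothetical addresses and
# checkpoint directory; a parameter server process would instead just call create_and_start_parameters_server):
#   cluster = create_cluster_spec("localhost:2222", "localhost:2223,localhost:2224")
#   target, device = create_worker_server_and_device(cluster, task_index=0, use_cpu=True)
#   with tf.device(device):
#       ...  # build the network under the replica device setter
#   sess = create_monitored_session(target, task_index=0,
#                                   checkpoint_dir="./checkpoints", save_checkpoint_secs=600)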


@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Union
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import InputEmbedding
class InputEmbedder(object):
"""
An input embedder is the first part of the network, which takes the input from the state and produces a vector
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
can be multiple embedders in a single network
"""
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
self.name = name
self.input_size = input_size
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.input = None
self.output = None
self.scheme = scheme
self.return_type = InputEmbedding
self.layers = []
self.input_rescaling = input_rescaling
self.input_offset = input_offset
self.input_clipping = input_clipping
def __call__(self, prev_input_placeholder=None):
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
else:
self.input = prev_input_placeholder
self._build_module()
return self.input, self.output
def _build_module(self):
# NOTE: for image inputs, we expect the data to be of type uint8 in order to be memory efficient. we chose not
# to implement the rescaling as an input filter (filters.observation.observation_filter), as this would have
# caused the input to the network to be float, which is 4x more expensive in memory,
# thus making each saved transition stored in the memory 4x more expensive as well.
input_layer = self.input / self.input_rescaling
input_layer -= self.input_offset
# clip the input using the given range
if self.input_clipping is not None:
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
self.layers.append(input_layer)
# layers order is conv -> batchnorm -> activation -> dropout
if isinstance(self.scheme, EmbedderScheme):
layers_params = self.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = tf.contrib.layers.flatten(self.layers[-1])
@property
def input_size(self) -> List[int]:
return self._input_size
@input_size.setter
def input_size(self, value: Union[int, List[int]]):
if isinstance(value, np.ndarray) or isinstance(value, tuple):
value = list(value)
elif isinstance(value, int):
value = [value]
if not isinstance(value, list):
raise ValueError((
'input_size expected to be a list, found {value} which has type {type}'
).format(value=value, type=type(value)))
self._input_size = value
@property
def schemes(self):
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")
def get_name(self):
return self.name
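# Note: concrete embedders are expected to override the `schemes` property with a dictionary mapping each
# EmbedderScheme value to a list of layer wrappers. A minimal hypothetical subclass (Dense is the layer wrapper
# from architecture.py, used here only for illustration):
#   class MyEmbedder(InputEmbedder):
#       schemes = {EmbedderScheme.Empty: [],
#                  EmbedderScheme.Shallow: [Dense([64])],
#                  EmbedderScheme.Medium: [Dense([128])],
#                  EmbedderScheme.Deep: [Dense([128]), Dense([128])]}
# The image and vector embedders that follow define the schemes actually shipped with the framework.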


@@ -0,0 +1,74 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputImageEmbedding
class ImageEmbedder(InputEmbedder):
"""
An input embedder that performs convolutions on the input and then flattens the result.
The embedder is intended for image-like inputs, where the channels are expected to be the last axis.
The embedder also allows custom rescaling of the input prior to the neural network.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Conv2d([32, 3, 1])
],
# atari dqn
EmbedderScheme.Medium:
[
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1])
],
# carla
EmbedderScheme.Deep: \
[
Conv2d([32, 5, 2]),
Conv2d([32, 3, 1]),
Conv2d([64, 3, 2]),
Conv2d([64, 3, 1]),
Conv2d([128, 3, 2]),
Conv2d([128, 3, 1]),
Conv2d([256, 3, 2]),
Conv2d([256, 3, 1])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
input_offset, input_clipping)
self.return_type = InputImageEmbedding
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
.format(input_size))
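# Usage sketch (hypothetical 84x84x4 stacked-frame observation):
#   embedder = ImageEmbedder([84, 84, 4], scheme=EmbedderScheme.Medium)
#   input_placeholder, embedding = embedder()
# With the default input_rescaling of 255.0, raw [0, 255] pixel values are scaled to [0, 1] before being passed
# through the convolutional scheme.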


@@ -0,0 +1,64 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputVectorEmbedding
class VectorEmbedder(InputEmbedder):
"""
An input embedder that is intended for inputs that can be represented as vectors.
The embedder flattens the input, applies several dense layers to it and returns the output.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Dense([128])
],
# dqn
EmbedderScheme.Medium:
[
Dense([256])
],
# carla
EmbedderScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
input_rescaling, input_offset, input_clipping)
self.return_type = InputVectorEmbedding
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
raise ValueError("The input size of a vector embedder must contain only a single dimension")


@@ -0,0 +1,344 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Dict
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
from rl_coach.core_types import PredictionType
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
"""
A generalized version of all possible networks implemented using tensorflow.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: whether the network is local (dedicated to the worker) or global (shared between workers)
:param network_is_trainable: whether the network is trainable (i.e. gradients can be applied to it)
"""
self.global_network = global_network
self.network_is_local = network_is_local
self.network_wrapper_name = name.split('/')[0]
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.gradients_from_head_rescalers = []
self.gradients_from_head_rescalers_placeholders = []
self.update_head_rescaler_value_ops = []
self.adaptive_learning_rate_scheme = None
self.current_learning_rate = None
# init network modules containers
self.input_embedders = []
self.output_heads = []
super().__init__(agent_parameters, spaces, name, global_network,
network_is_local, network_is_trainable)
def fill_return_types():
ret_dict = {}
for cls in get_all_subclasses(PredictionType):
ret_dict[cls] = []
components = self.input_embedders + [self.middleware] + self.output_heads
for component in components:
if not hasattr(component, 'return_type'):
raise ValueError("{} has no return_type attribute. This should not happen.".format(component))
if component.return_type is not None:
ret_dict[component.return_type].append(component)
return ret_dict
self.available_return_types = fill_return_types()
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
"""
Search for the component(s) whose return_type is set to the requested PredictionType, and get
predictions for it.
:param states: The input states to the network.
:param prediction_type: The requested PredictionType to look for in the network components
:return: A dictionary with predictions for all components matching the requested prediction type
"""
ret_dict = {}
for component in self.available_return_types[prediction_type]:
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
return ret_dict
@staticmethod
def get_activation_function(activation_function_string: str):
"""
Map the activation function from a string to the tensorflow framework equivalent
:param activation_function_string: the type of the activation function
:return: the tensorflow activation function
"""
activation_functions = {
'relu': tf.nn.relu,
'tanh': tf.nn.tanh,
'sigmoid': tf.nn.sigmoid,
'elu': tf.nn.elu,
'selu': tf.nn.selu,
'leaky_relu': tf.nn.leaky_relu,
'none': None
}
assert activation_function_string in activation_functions.keys(), \
"Activation function must be one of the following {}. instead it was: {}"\
.format(activation_functions.keys(), activation_function_string)
return activation_functions[activation_function_string]
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
"""
Given an input embedder parameters class, creates the input embedder and returns it
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
be a value within the state or the action.
:param embedder_params: the parameters of the class of the embedder
:return: the embedder instance
"""
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
allowed_inputs["action"] = copy.copy(self.spaces.action)
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
if input_name not in allowed_inputs.keys():
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
.format(input_name, allowed_inputs.keys()))
type = "vector"
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
type = "image"
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
embedder_params_copy = copy.copy(embedder_params)
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
embedder_params_copy.name = input_name
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
path=embedder_path,
positional_args=[allowed_inputs[input_name].shape])
return module
def get_middleware(self, middleware_params: MiddlewareParameters):
"""
Given a middleware type, creates the middleware and returns it
:param middleware_params: the parameters of the middleware class
:return: the middleware instance
"""
middleware_params_copy = copy.copy(middleware_params)
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
return module
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
"""
Given a head type, creates the head and returns it
:param head_params: the parameters of the head to create, which also determine the head class to instantiate
:param head_idx: the head index
:param loss_weight: the weight to assign to the head loss
:return: the head
"""
head_params_copy = copy.copy(head_params)
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
def get_model(self):
# validate the configuration
if len(self.network_parameters.input_embedders_parameters) == 0:
raise ValueError("At least one input type should be defined")
if len(self.network_parameters.heads_parameters) == 0:
raise ValueError("At least one output type should be defined")
if self.network_parameters.middleware_parameters is None:
raise ValueError("Exactly one middleware type should be defined")
if len(self.network_parameters.loss_weights) == 0:
raise ValueError("At least one loss weight should be defined")
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
raise ValueError("Number of loss weights should match the number of output types")
for network_idx in range(self.num_networks):
with tf.variable_scope('network_{}'.format(network_idx)):
####################
# Input Embeddings #
####################
state_embedding = []
for input_name in sorted(self.network_parameters.input_embedders_parameters):
input_type = self.network_parameters.input_embedders_parameters[input_name]
# get the class of the input embedder
input_embedder = self.get_input_embedder(input_name, input_type)
self.input_embedders.append(input_embedder)
# input placeholders are reused between networks. on the first network, store the placeholders
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
# the existing input_placeholders into the input_embedders.
if network_idx == 0:
input_placeholder, embedding = input_embedder()
self.inputs[input_name] = input_placeholder
else:
input_placeholder, embedding = input_embedder(self.inputs[input_name])
state_embedding.append(embedding)
##########
# Merger #
##########
if len(state_embedding) == 1:
state_embedding = state_embedding[0]
else:
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
state_embedding = tf.add_n(state_embedding, name="merger")
##############
# Middleware #
##############
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
_, self.state_embedding = self.middleware(state_embedding)
################
# Output Heads #
################
head_count = 0
for head_idx in range(self.num_heads_per_network):
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
if self.network_parameters.use_separate_networks_per_head:
# if we use separate networks per head, then the head type corresponds to the network idx
head_type_idx = network_idx
head_count = network_idx
else:
# if we use a single network with multiple heads, then the head type is the current head idx
head_type_idx = head_idx
self.output_heads.append(
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
head_copy_idx,
self.network_parameters.loss_weights[head_type_idx])
)
# rescale the gradients from the head
self.gradients_from_head_rescalers.append(
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
initializer=float(
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
),
dtype=tf.float32))
self.gradients_from_head_rescalers_placeholders.append(
tf.placeholder('float',
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
self.gradients_from_head_rescalers_placeholders[head_count]))
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
self.gradients_from_head_rescalers[head_count] * self.state_embedding
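# note: this convex combination keeps the forward value of head_input identical to state_embedding, but
# scales the gradient flowing back into the middleware by the rescaler value, since the stop_gradient
# branch contributes no gradient.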
# build the head
if self.network_is_local:
output, target_placeholder, input_placeholders, importance_weight_ph = \
self.output_heads[-1](head_input)
self.targets.extend(target_placeholder)
self.importance_weights.extend(importance_weight_ph)
else:
output, input_placeholders = self.output_heads[-1](head_input)
self.outputs.extend(output)
# TODO: use head names as well
for placeholder_index, input_placeholder in enumerate(input_placeholders):
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
head_count += 1
# Losses
self.losses = tf.losses.get_losses(self.full_name)
self.losses += tf.losses.get_regularization_losses(self.full_name)
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
# tf.summary.scalar('total_loss', self.total_loss)
# Learning rate
if self.network_parameters.learning_rate_decay_rate != 0:
self.adaptive_learning_rate_scheme = \
tf.train.exponential_decay(
self.network_parameters.learning_rate,
self.global_step,
decay_steps=self.network_parameters.learning_rate_decay_steps,
decay_rate=self.network_parameters.learning_rate_decay_rate,
staircase=True)
self.current_learning_rate = self.adaptive_learning_rate_scheme
else:
self.current_learning_rate = self.network_parameters.learning_rate
# Optimizer
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
# distributed training + is a local network + optimizer shared -> take the global optimizer
self.optimizer = self.global_network.optimizer
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
or self.network_parameters.shared_optimizer or not self.distributed_training:
# distributed training + is a global network + optimizer shared
# OR
# distributed training + is a local network + optimizer not shared
# OR
# non-distributed training
# -> create an optimizer
if self.network_parameters.optimizer_type == 'Adam':
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
beta1=self.network_parameters.adam_optimizer_beta1,
beta2=self.network_parameters.adam_optimizer_beta2,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'RMSProp':
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
decay=self.network_parameters.rms_prop_optimizer_decay,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'LBFGS':
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
options={'maxiter': 25})
else:
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))


@@ -0,0 +1,54 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class CategoricalQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
class CategoricalQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'categorical_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.input = [self.actions]
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
self.num_atoms))
# softmax on atoms dimension
self.output = tf.nn.softmax(values_distribution)
# calculate cross entropy loss
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
name="distributions")
self.target = self.distributions
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
tf.losses.add_loss(self.loss)
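# Note: this is the distributional (Categorical DQN / C51) variant of the Q head. self.output holds, per action,
# a softmax distribution over self.num_atoms support atoms; the scalar Q-values used for action selection are
# recovered by the agent as the expectation over that support, roughly:
#   q_values = np.sum(z_support * distribution, axis=-1)
# where z_support (the atom values) is derived from the agent's algorithm parameters (the name is assumed here
# for illustration).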


@@ -0,0 +1,66 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.core_types import ActionProbabilities
class DDPGActorHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
self.batchnorm = batchnorm
class DDPGActor(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
batchnorm: bool=True):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ddpg_actor_head'
self.return_type = ActionProbabilities
self.num_actions = self.spaces.action.shape
self.batchnorm = batchnorm
# bounded actions
self.output_scale = self.spaces.action.max_abs_range
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
self.activation_function,
False, 0, 0)[-1]
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += \
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
self.output = [self.policy_mean]

View File

@@ -0,0 +1,87 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
class DNDQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
class DNDQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dnd_q_values_head'
self.DND_size = agent_parameters.algorithm.dnd_size
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
self.number_of_nn = agent_parameters.algorithm.number_of_knn
self.ap = agent_parameters
self.dnd_embeddings = [None] * self.num_actions
self.dnd_values = [None] * self.num_actions
self.dnd_indices = [None] * self.num_actions
self.dnd_distances = [None] * self.num_actions
if self.ap.memory.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
def _build_module(self, input_layer):
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
else:
self.DND = differentiable_neural_dictionary.QDND(
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
key_error_threshold=self.DND_key_error_threshold,
learning_rate=self.network_parameters.learning_rate,
num_neighbors=self.number_of_nn,
override_existing_keys=True)
# Retrieve info from DND dictionary
# We assume that all actions have enough entries in the DND
self.output = tf.transpose([
self._q_value(input_layer, action)
for action in range(self.num_actions)
])
def _q_value(self, input_layer, action):
result = tf.py_func(self.DND.query,
[input_layer, action, self.number_of_nn],
[tf.float64, tf.float64, tf.int64])
self.dnd_embeddings[action] = tf.to_float(result[0])
self.dnd_values[action] = tf.to_float(result[1])
self.dnd_indices[action] = result[2]
# DND calculation
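# the Q value is a kernel-weighted average of the values stored for the retrieved nearest neighbors:
# k(h, h_i) = 1 / (||h - h_i||^2 + delta), w_i = k(h, h_i) / sum_j k(h, h_j), Q = sum_i w_i * v_i
# where h is the current embedding, h_i / v_i are the retrieved DND keys / values and delta is l2_norm_added_delta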
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
self.dnd_distances[action] = distances
weights = 1.0 / distances
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
q_value.set_shape((None,))
return q_value
def _post_build(self):
# DND gradients
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
class DuelingQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
class DuelingQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dueling_q_values_head'
def _build_module(self, input_layer):
# state value tower - V
with tf.variable_scope("state_value"):
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
state_value = tf.layers.dense(state_value, 1, name='fc2')
# state_value = tf.expand_dims(state_value, axis=-1)
# action advantage tower - A
with tf.variable_scope("action_advantage"):
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
action_advantage = action_advantage - tf.reduce_mean(action_advantage, axis=1, keepdims=True)
# merge to state-action value function Q
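# Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')), following the dueling network decomposition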
self.output = tf.add(state_value, action_advantage, name='output')

View File

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, Parameters
from rl_coach.spaces import SpacesDefinition
from tensorflow.python.ops.losses.losses_impl import Reduction
from rl_coach.utils import force_list
# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
def _initializer(shape, dtype=None, partition_info=None):
out = np.random.randn(*shape).astype(np.float32)
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
return tf.constant(out)
return _initializer
class HeadParameters(Parameters):
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
super().__init__()
self.activation_function = activation_function
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Head(object):
"""
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
an assigned loss function. The heads are algorithm dependent.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
self.head_idx = head_idx
self.network_name = network_name
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
self.name = "head"
self.output = []
self.loss = []
self.loss_type = []
self.regularizations = []
self.loss_weight = force_list(loss_weight)
self.target = []
self.importance_weight = []
self.input = []
self.is_local = is_local
self.ap = agent_parameters
self.spaces = spaces
self.return_type = None
self.activation_function = activation_function
def __call__(self, input_layer):
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the output of the last layer and the target placeholder
"""
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
self._build_module(input_layer)
self.output = force_list(self.output)
self.target = force_list(self.target)
self.input = force_list(self.input)
self.loss_type = force_list(self.loss_type)
self.loss = force_list(self.loss)
self.regularizations = force_list(self.regularizations)
if self.is_local:
self.set_loss()
self._post_build()
if self.is_local:
return self.output, self.target, self.input, self.importance_weight
else:
return self.output, self.input
def _build_module(self, input_layer):
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the graph
in self.output.
:param input_layer: the input to the graph
:return: None
"""
pass
def _post_build(self):
"""
Optional function that allows adding any extra definitions after the head has been fully defined
For example, this allows doing additional calculations that are based on the loss
:return: None
"""
pass
def get_name(self):
"""
Get a formatted name for the module
:return: the formatted name
"""
return '{}_{}'.format(self.name, self.head_idx)
def set_loss(self):
"""
Creates a target placeholder and loss function for each loss_type and regularization
:return: None
"""
# there are heads that define the loss internally, but we need to create additional placeholders for them
for idx in range(len(self.loss)):
importance_weight = tf.placeholder('float',
[None] + [1] * (len(self.target[idx].shape) - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# add losses and target placeholder
for idx in range(len(self.loss_type)):
# create target placeholder
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
self.target.append(target)
# create importance sampling weights placeholder
num_target_dims = len(self.target[idx].shape)
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
# weights the specific loss of this head against other losses in this head or in other heads
loss_weight = self.loss_weight[idx]*importance_weight
loss = self.loss_type[idx](self.target[-1], self.output[idx],
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
# we add the loss to the losses collection and later we will extract it in general_network
tf.losses.add_loss(loss)
self.loss.append(loss)
# add regularizations
for regularization in self.regularizations:
self.loss.append(regularization)
@classmethod
def path(cls):
return cls.__name__

View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import Measurements
class MeasurementsPredictionHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
super().__init__(parameterized_class=MeasurementsPredictionHead,
activation_function=activation_function, name=name)
class MeasurementsPredictionHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'future_measurements_head'
self.num_actions = len(self.spaces.action.actions)
self.num_measurements = self.spaces.state['measurements'].shape[0]
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
self.return_type = Measurements
def _build_module(self, input_layer):
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
# actions expectation tower (expectation stream) - E
with tf.variable_scope("expectation_stream"):
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
# action fine differences tower (action stream) - A
with tf.variable_scope("action_stream"):
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
name='output')
action_stream = tf.reshape(action_stream,
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
# merge to future measurements predictions
self.output = tf.add(expectation_stream, action_stream, name='output')
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
name="targets")
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
tf.losses.add_loss(self.loss_weight[0] * self.loss)

View File

@@ -0,0 +1,88 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class NAFHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
class NAFHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
if not isinstance(self.spaces.action, BoxActionSpace):
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
self.name = 'naf_q_values_head'
self.num_actions = self.spaces.action.shape[0]
self.output_scale = self.spaces.action.max_abs_range
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# NAF
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
self.input = self.action
# V Head
self.V = tf.layers.dense(input_layer, 1, name='V')
# mu Head
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
# A Head
# l_vector holds the values of a lower-triangular matrix, flattened into a single vector
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
# Convert l to a lower triangular matrix and exponentiate its diagonal
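# illustrative example (num_actions = 2): l_vector = [a, b, c] is unpacked column by column into
# L = [[exp(a), 0],
#      [b, exp(c)]]
# exponentiating the diagonal keeps P = L * L^T positive definite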
i = 0
columns = []
for col in range(self.num_actions):
start_row = col
num_non_zero_elements = self.num_actions - start_row
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
i += num_non_zero_elements
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
# P = L*L^T
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
# A = -1/2 * (u - mu)^T * P * (u - mu)
action_diff = tf.expand_dims(self.action - self.mu, -1)
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
self.A = tf.reshape(a_matrix_form, [-1, 1])
# Q Head
self.Q = tf.add(self.V, self.A, name='Q')
self.output = self.Q

View File

@@ -0,0 +1,151 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.core_types import ActionProbabilities
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
class PolicyHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
class PolicyHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'policy_values_head'
self.return_type = ActionProbabilities
self.beta = None
self.action_penalty = None
self.exploration_policy = agent_parameters.exploration
# a scalar weight that penalizes low entropy values to encourage exploration
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
self.beta = agent_parameters.algorithm.beta_entropy
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
self.actions = []
self.input = self.actions
self.policy_distributions = []
self.output = []
action_spaces = [self.spaces.action]
if isinstance(self.spaces.action, CompoundActionSpace):
action_spaces = self.spaces.action.sub_action_spaces
# create a compound action network
for action_space_idx, action_space in enumerate(action_spaces):
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
if isinstance(action_space, DiscreteActionSpace):
# create a discrete action network (softmax probabilities output)
self._build_discrete_net(input_layer, action_space)
elif isinstance(action_space, BoxActionSpace):
# create a continuous action network (bounded mean and stdev outputs)
self._build_continuous_net(input_layer, action_space)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate loss
self.action_log_probs_wrt_policy = \
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
tf.losses.add_loss(self.loss_weight[0] * self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
self.policy_distributions.append(policy_distribution)
self.output.append(self.policy_probs)
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
# output activation function
if np.all(self.spaces.action.max_abs_range < np.inf):
# bounded actions
self.output_scale = action_space.max_abs_range
self.continuous_output_activation = self.activation_function
else:
# unbounded actions
self.output_scale = 1
self.continuous_output_activation = None
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean) if self.continuous_output_activation else pre_activation_policy_values_mean
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
self.output.append(self.policy_mean)
# standard deviation
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
# the stdev is an output of the network and uses a softplus activation as defined in A3C
policy_values_std = tf.layers.dense(input_layer, num_actions,
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
self.output.append(self.policy_std)
else:
# the stdev is an externally given value
# Warning: we need to explicitly put this variable in the local variables collections, since defining
# it as not trainable puts it for some reason in the global variables collections. If this is not done,
# the variable won't be initialized and when working with multiple workers they will get stuck.
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
# assign op for the policy std
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
# define the distributions for the policy and the old policy
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
self.policy_distributions.append(policy_distribution)
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += [
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]

View File

@@ -0,0 +1,144 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
from rl_coach.core_types import ActionProbabilities
class PPOHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
class PPOHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_head'
self.return_type = ActionProbabilities
# used in regular PPO
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
if self.use_kl_regularization:
# kl coefficient and its corresponding assignment operation and placeholder
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
trainable=False, name='kl_coefficient')
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.beta = agent_parameters.algorithm.beta_entropy
def _build_module(self, input_layer):
if isinstance(self.spaces.action, DiscreteActionSpace):
self._build_discrete_net(input_layer, self.spaces.action)
elif isinstance(self.spaces.action, BoxActionSpace):
self._build_continuous_net(input_layer, self.spaces.action)
else:
raise ValueError("only discrete or continuous action spaces are supported for PPO")
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
# Used by regular PPO only
# add kl divergence regularization
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
if self.use_kl_regularization:
# no clipping => use kl regularization
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate surrogate loss
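# the clipped surrogate objective from PPO: with likelihood ratio r = pi(a|s) / pi_old(a|s) and advantage A,
# L_CLIP = E[min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A)]; the code below minimizes its negation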
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
# the likelihood ratio is computed as exp(log_prob - old_log_prob), so it is always strictly positive
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
if self.clip_likelihood_ratio_using_epsilon is not None:
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
self.input.append(self.clip_param_rescaler)
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
self.clipped_likelihood_ratio * self.advantages)
else:
self.scaled_advantages = self.likelihood_ratio * self.advantages
# the minus sign turns maximizing the surrogate objective into a minimization problem
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
self.loss = self.surrogate_loss
tf.losses.add_loss(self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
# Policy Head
self.input = [self.actions, self.old_policy_mean]
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
self.output = self.policy_mean
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape[0]
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
kernel_initializer=normalized_columns_initializer(0.01))
if self.is_local:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
collections=[tf.GraphKeys.LOCAL_VARIABLES])
else:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
self.output = [self.policy_mean, self.policy_std]

View File

@@ -0,0 +1,52 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import ActionProbabilities
class PPOVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
class PPOVHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_v_head'
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.return_type = ActionProbabilities
def _build_module(self, input_layer):
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
self.input = [self.old_policy_value]
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
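# clipped value loss, as used in common PPO implementations: the value prediction is only allowed to move
# within an epsilon-sized region around the old value estimate, and the per-sample maximum of the clipped
# and unclipped squared errors is taken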
value_loss_1 = tf.square(self.output - self.target)
value_loss_2 = tf.square(self.old_policy_value +
tf.clip_by_value(self.output - self.old_policy_value,
-self.clip_likelihood_ratio_using_epsilon,
self.clip_likelihood_ratio_using_epsilon) - self.target)
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
self.loss = self.vf_loss
tf.losses.add_loss(self.loss)

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
class QHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'q_values_head'
if isinstance(self.spaces.action, BoxActionSpace):
self.num_actions = 1
elif isinstance(self.spaces.action, DiscreteActionSpace):
self.num_actions = len(self.spaces.action.actions)
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard Q Network
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QuantileRegressionQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params'):
super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
name=name)
class QuantileRegressionQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'quantile_regression_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval # k
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
self.input = [self.actions, self.quantile_midpoints]
# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
self.output = quantiles_locations
self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
self.target = self.quantiles
# only the quantiles of the taken action are taken into account
quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)
# reorder the output quantiles and the target quantiles as a preparation step for calculating the loss:
# the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles),
# and the target quantiles vector is tiled as the columns of an NxN matrix
theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])
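# theta_i, T_theta_j and tau_i all have shape [batch_size, num_atoms, num_atoms]; entry (i, j) pairs
# predicted quantile i with target sample j, so the loss below runs over all quantile pairs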
# Huber loss of T(theta_j) - theta_i
error = T_theta_j - theta_i
abs_error = tf.abs(error)
quadratic = tf.minimum(abs_error, self.huber_loss_interval)
huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2
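# this is the standard Huber loss with interval k: 0.5 * u^2 for |u| <= k, and k * (|u| - 0.5 * k) otherwise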
# Quantile Huber loss
quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss
# Quantile regression loss (the probability for each quantile is 1/num_quantiles)
quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
self.loss = quantile_regression_loss
tf.losses.add_loss(self.loss)

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import VStateValue
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params'):
super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'v_values_head'
self.return_type = VStateValue
if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard V Network
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))

View File

@@ -0,0 +1,86 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
from rl_coach.core_types import Middleware_FC_Embedding
class FCMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu',
scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
class FCMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
Dense([64])
],
# dqn
MiddlewareScheme.Medium:
[
Dense([512])
],
MiddlewareScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_FC_Embedding
self.layers = []
def _build_module(self):
self.layers.append(self.input)
if isinstance(self.scheme, MiddlewareScheme):
layers_params = FCMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = self.layers[-1]
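# usage sketch (illustrative only, not part of this file): since `scheme` may also be a plain list of layer
# wrappers (see the isinstance check above), a custom fully-connected middleware could be configured as
#   FCMiddlewareParameters(scheme=[Dense([256]), Dense([256])], batchnorm=True)
# which stacks two 256-unit dense layers, each followed by the batchnorm / activation / dropout block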

View File

@@ -0,0 +1,113 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import Middleware_LSTM_Embedding
class LSTMMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu', number_of_lstm_cells=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
self.number_of_lstm_cells = number_of_lstm_cells
class LSTMMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
[64]
],
# dqn
MiddlewareScheme.Medium:
[
[512]
],
MiddlewareScheme.Deep: \
[
[128],
[128],
[128]
]
}
def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_LSTM_Embedding
self.number_of_lstm_cells = number_of_lstm_cells
self.layers = []
def _build_module(self):
"""
self.state_in: tuple of placeholders containing the initial state
self.state_out: tuple of output state
todo: it appears that the shape of the output is batch, feature
the code here seems to be slicing off the first element in the batch
which would definitely be wrong. need to double check the shape
"""
self.layers.append(self.input)
# optionally insert some dense layers before the LSTM
if isinstance(self.scheme, MiddlewareScheme):
layers_params = LSTMMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
# add the LSTM layer
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
self.state_init = [self.c_init, self.h_init]
self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
self.state_in = (self.c_in, self.h_in)
rnn_in = tf.expand_dims(self.layers[-1], [0])
step_size = tf.shape(self.layers[-1])[:1]
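# note (assumption, see also the open question in the docstring above): the incoming batch of shape
# [batch, features] is fed to the LSTM as a single sequence - rnn_in has shape [1, batch, features]
# (an RNN batch of size 1) and sequence_length equals the number of samples in the batch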
state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
lstm_c, lstm_h = lstm_state
self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type, Union, List
import tensorflow as tf
from rl_coach.base_parameters import MiddlewareScheme, Parameters
from rl_coach.core_types import MiddlewareEmbedding
class MiddlewareParameters(Parameters):
def __init__(self, parameterized_class: Type['Middleware'],
activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
batchnorm: bool=False, dropout: bool=False,
name='middleware'):
super().__init__()
self.activation_function = activation_function
self.scheme = scheme
self.batchnorm = batchnorm
self.dropout = dropout
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Middleware(object):
"""
A middleware embedder is the middle part of the network. It takes the embeddings from the input embedders,
after they have been aggregated by some method (for example, concatenation), and passes them through a neural
network that is customizable but shared between the heads of the network
"""
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
self.name = name
self.input = None
self.output = None
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.scheme = scheme
self.return_type = MiddlewareEmbedding
def __call__(self, input_layer):
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()
return self.input, self.output
def _build_module(self):
pass
def get_name(self):
return self.name

View File

@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
class SharedRunningStats(object):
def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
self.sess = None
self.name = name
self.replicated_device = replicated_device
self.epsilon = epsilon
self.ops_were_created = False
if create_ops:
with tf.device(replicated_device):
self.create_ops()
def create_ops(self, shape=[1], clip_values=None):
self.clip_values = clip_values
with tf.variable_scope(self.name):
self._sum = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(0.0),
name="running_sum", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._sum_squared = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(self.epsilon),
name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._count = tf.get_variable(
dtype=tf.float64,
shape=(),
initializer=tf.constant_initializer(self.epsilon),
name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._shape = None
self._mean = tf.div(self._sum, self._count, name="mean")
self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
/ tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
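# running statistics: mean = sum / count and std = sqrt(max((sum_sq - count * mean^2) / max(count - 1, 1), epsilon)),
# i.e. an (approximately) unbiased sample standard deviation accumulated over all pushed batches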
self.tf_mean = tf.cast(self._mean, 'float32')
self.tf_std = tf.cast(self._std, 'float32')
self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)
self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
self.normalized_obs = (self.raw_obs - self._mean) / self._std
if self.clip_values is not None:
self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])
self.ops_were_created = True
def set_session(self, sess):
self.sess = sess
def push(self, x):
x = x.astype('float64')
self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
feed_dict={
self.new_sum: x.sum(axis=0).ravel(),
self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
self.newcount: np.array(len(x), dtype='float64')
})
if self._shape is None:
self._shape = x.shape
@property
def n(self):
return self.sess.run(self._count)
@property
def mean(self):
return self.sess.run(self._mean)
@property
def var(self):
return self.std ** 2
@property
def std(self):
return self.sess.run(self._std)
@property
def shape(self):
return self._shape
@shape.setter
def shape(self, val):
self._shape = val
self.new_sum.set_shape(val)
self.new_sum_squared.set_shape(val)
self.tf_mean.set_shape(val)
self.tf_std.set_shape(val)
self._sum.set_shape(val)
self._sum_squared.set_shape(val)
def normalize(self, batch):
if self.clip_values is not None:
return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
else:
return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
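# usage sketch (illustrative only; `obs_dim` and `observations` stand in for the caller's data):
#   stats = SharedRunningStats(name="observation_stats", create_ops=False)
#   stats.create_ops(shape=[obs_dim], clip_values=(-5.0, 5.0))
#   stats.set_session(tf.Session())
#   stats.sess.run(tf.global_variables_initializer())
#   stats.push(observations)                      # accumulate sum, sum of squares and count
#   normalized = stats.normalize(observations)    # normalize (and clip) with the running statistics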