pre-release 0.10.0

rl_coach/architectures/tensorflow_components/architecture.py (new file, 664 lines)
@@ -0,0 +1,664 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import force_list, squeeze_list
|
||||
|
||||
from rl_coach.architectures.architecture import Architecture
|
||||
from rl_coach.core_types import GradientClippingMethod
|
||||
|
||||
|
||||
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
|
||||
layers = [input_layer]
|
||||
|
||||
# batchnorm
|
||||
if batchnorm:
|
||||
layers.append(
|
||||
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# activation
|
||||
if activation_function:
|
||||
layers.append(
|
||||
activation_function(layers[-1], name="activation{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# dropout
|
||||
if dropout:
|
||||
layers.append(
|
||||
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# remove the input layer from the layers list
|
||||
del layers[0]
|
||||
|
||||
return layers
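
# Illustrative usage sketch (added for clarity, not part of the original module). It assumes a
# TF1-style graph; the placeholder shape and hyperparameters are arbitrary example values.
def _example_batchnorm_activation_dropout():
    example_input = tf.placeholder(tf.float32, shape=[None, 128], name="example_input")
    hidden = tf.layers.dense(example_input, 256, name="example_fc")
    # chain batchnorm -> activation -> dropout after the dense layer, in that order
    post_layers = batchnorm_activation_dropout(hidden, batchnorm=True,
                                               activation_function=tf.nn.relu,
                                               dropout=True, dropout_rate=0.3, layer_idx=0)
    return post_layers[-1]  # the tensor produced by the last applied sub-layer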
|
||||
|
||||
|
||||
class Conv2d(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_filters, kernel_size, strides]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow conv2d layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: conv2d layer
|
||||
"""
|
||||
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
|
||||
data_format='channels_last', name=name)
|
||||
|
||||
|
||||
class Dense(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_output_neurons]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow dense layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: dense layer
|
||||
"""
|
||||
return tf.layers.dense(input_layer, self.params[0], name=name)
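
# Illustrative sketch (not part of the original module): these thin wrappers take their
# hyperparameters as plain lists, which is how the embedder schemes elsewhere in this change
# declare layers. For example, Conv2d([32, 8, 4]) means 32 filters, an 8x8 kernel and stride 4,
# and Dense([256]) means a fully connected layer with 256 output neurons. The shapes below are
# arbitrary example values.
def _example_layer_wrappers():
    frames = tf.placeholder(tf.float32, shape=[None, 84, 84, 4], name="example_frames")
    conv = Conv2d([32, 8, 4])(frames, name="example_conv0")
    flat = tf.contrib.layers.flatten(conv)
    return Dense([256])(flat, name="example_fc0")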
|
||||
|
||||
|
||||
def variable_summaries(var):
|
||||
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
|
||||
with tf.name_scope('summaries'):
|
||||
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
|
||||
|
||||
with tf.name_scope(layer_weight_name):
|
||||
mean = tf.reduce_mean(var)
|
||||
tf.summary.scalar('mean', mean)
|
||||
with tf.name_scope('stddev'):
|
||||
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
|
||||
tf.summary.scalar('stddev', stddev)
|
||||
tf.summary.scalar('max', tf.reduce_max(var))
|
||||
tf.summary.scalar('min', tf.reduce_min(var))
|
||||
tf.summary.histogram('histogram', var)
|
||||
|
||||
|
||||
def local_getter(getter, name, *args, **kwargs):
|
||||
"""
|
||||
This is a wrapper around the tf.get_variable function which puts the variables in the local variables collection
|
||||
instead of the global variables collection. The local variables collection will hold variables which are not shared
|
||||
between workers. these variables are also assumed to be non-trainable (the optimizer does not apply gradients to
|
||||
these variables), but we can calculate the gradients wrt these variables, and we can update their content.
|
||||
"""
|
||||
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
|
||||
return getter(name, *args, **kwargs)
|
||||
|
||||
|
||||
class TensorFlowArchitecture(Architecture):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
super().__init__(agent_parameters, spaces, name)
|
||||
self.middleware = None
|
||||
self.network_is_local = network_is_local
|
||||
self.global_network = global_network
|
||||
if not self.network_parameters.tensorflow_support:
|
||||
raise ValueError('TensorFlow is not supported for this agent')
|
||||
self.sess = None
|
||||
self.inputs = {}
|
||||
self.outputs = []
|
||||
self.targets = []
|
||||
self.importance_weights = []
|
||||
self.losses = []
|
||||
self.total_loss = None
|
||||
self.trainable_weights = []
|
||||
self.weights_placeholders = []
|
||||
self.shared_accumulated_gradients = []
|
||||
self.curr_rnn_c_in = None
|
||||
self.curr_rnn_h_in = None
|
||||
self.gradients_wrt_inputs = []
|
||||
self.train_writer = None
|
||||
self.accumulated_gradients = None
|
||||
self.network_is_trainable = network_is_trainable
|
||||
|
||||
self.is_chief = self.ap.task_parameters.task_index == 0
|
||||
self.network_is_global = not self.network_is_local and global_network is None
|
||||
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
|
||||
|
||||
self.optimizer_type = self.network_parameters.optimizer_type
|
||||
if self.ap.task_parameters.seed is not None:
|
||||
tf.set_random_seed(self.ap.task_parameters.seed)
|
||||
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
|
||||
custom_getter=local_getter if network_is_local and global_network else None):
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
|
||||
# build the network
|
||||
self.get_model()
|
||||
|
||||
# model weights
|
||||
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
|
||||
|
||||
# create the placeholders for assigning gradients, and some tensorboard summaries for the weights
|
||||
for idx, var in enumerate(self.weights):
|
||||
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
|
||||
self.weights_placeholders.append(placeholder)
|
||||
if self.ap.visualization.tensorboard:
|
||||
variable_summaries(var)
|
||||
|
||||
# create op for assigning a list of weights to the network weights
|
||||
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
|
||||
zip(self.weights_placeholders, self.weights)]
|
||||
|
||||
# locks for synchronous training
|
||||
if self.network_is_global:
|
||||
self._create_locks_for_synchronous_training()
|
||||
|
||||
# gradients ops
|
||||
self._create_gradient_ops()
|
||||
|
||||
# L2 regularization
|
||||
if self.network_parameters.l2_regularization != 0:
|
||||
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
|
||||
* self.network_parameters.l2_regularization]
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
|
||||
|
||||
self.inc_step = self.global_step.assign_add(1)
|
||||
|
||||
# reset LSTM hidden cells
|
||||
self.reset_internal_memory()
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
|
||||
scope=tf.contrib.framework.get_name_scope())
|
||||
self.merged = tf.summary.merge(current_scope_summaries)
|
||||
|
||||
# initialize or restore model
|
||||
self.init_op = tf.group(
|
||||
tf.global_variables_initializer(),
|
||||
tf.local_variables_initializer()
|
||||
)
|
||||
|
||||
# set the fetches for training
|
||||
self._set_initial_fetch_list()
|
||||
|
||||
def _set_initial_fetch_list(self):
|
||||
"""
|
||||
Create an initial list of tensors to fetch in each training iteration
|
||||
:return: None
|
||||
"""
|
||||
self.train_fetches = [self.gradients_norm]
|
||||
if self.network_parameters.clip_gradients:
|
||||
self.train_fetches.append(self.clipped_grads)
|
||||
else:
|
||||
self.train_fetches.append(self.tensor_gradients)
|
||||
self.train_fetches += [self.total_loss, self.losses]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.train_fetches.append(self.middleware.state_out)
|
||||
self.additional_fetches_start_idx = len(self.train_fetches)
|
||||
|
||||
def _create_locks_for_synchronous_training(self):
|
||||
"""
|
||||
Create locks for synchronizing the different workers during training
|
||||
:return: None
|
||||
"""
|
||||
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.lock = self.lock_counter.assign_add(1, use_locking=True)
|
||||
self.lock_init = self.lock_counter.assign(0)
|
||||
|
||||
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.release = self.release_counter.assign_add(1, use_locking=True)
|
||||
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
|
||||
self.release_init = self.release_counter.assign(0)
|
||||
|
||||
def _create_gradient_ops(self):
|
||||
"""
|
||||
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
|
||||
:return: None
|
||||
"""
|
||||
|
||||
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
|
||||
self.gradients_norm = tf.global_norm(self.tensor_gradients)
|
||||
|
||||
# gradient clipping
|
||||
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
|
||||
self._create_gradient_clipping_ops()
|
||||
|
||||
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
|
||||
# applying them
|
||||
if self.distributed_training:
|
||||
self._create_gradient_accumulators()
|
||||
|
||||
# gradients of the outputs w.r.t. the inputs
|
||||
# at the moment, this is only used by ddpg
|
||||
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
|
||||
self.inputs.items()} for output in self.outputs]
|
||||
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
|
||||
for i in range(len(self.outputs))]
|
||||
self.weighted_gradients = []
|
||||
for i in range(len(self.outputs)):
|
||||
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
|
||||
# unnormalized gradients seem to work better at the moment. TODO: validate this across more environments
|
||||
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
|
||||
# unnormalized_gradients)))
|
||||
self.weighted_gradients.append(unnormalized_gradients)
|
||||
|
||||
# defining the optimization process (for LBFGS we have less control over the optimizer)
|
||||
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
|
||||
self._create_gradient_applying_ops()
|
||||
|
||||
def _create_gradient_accumulators(self):
|
||||
if self.network_is_global:
|
||||
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
|
||||
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
|
||||
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
|
||||
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
|
||||
self.shared_accumulated_gradients]
|
||||
elif self.network_is_local:
|
||||
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
|
||||
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
|
||||
|
||||
def _create_gradient_clipping_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
|
||||
:return: None
|
||||
"""
|
||||
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
|
||||
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
|
||||
self.clipped_grads = [tf.clip_by_value(grad,
|
||||
-self.network_parameters.clip_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
|
||||
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
|
||||
def _create_gradient_applying_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
|
||||
(distributed training - local or global network, shared optimizer, etc.)
|
||||
:return: None
|
||||
"""
|
||||
if self.network_is_global and self.network_parameters.shared_optimizer and \
|
||||
not self.network_parameters.async_training:
|
||||
# synchronous training with shared optimizer? -> create an operation for applying the gradients
|
||||
# accumulated in the shared gradients accumulator
|
||||
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.shared_accumulated_gradients, self.weights),
|
||||
global_step=self.global_step)
|
||||
|
||||
elif self.distributed_training and self.network_is_local:
|
||||
# distributed training but independent optimizer? -> create an operation for applying the gradients
|
||||
# to the global weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
|
||||
|
||||
elif self.network_is_trainable:
|
||||
# not any of the above but is trainable? -> create an operation for applying the gradients to
|
||||
# this network weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
|
||||
|
||||
def set_session(self, sess):
|
||||
self.sess = sess
|
||||
|
||||
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
|
||||
# initialize the session parameters in single threaded runs. Otherwise, this is done through the
|
||||
# MonitoredSession object in the graph manager
|
||||
if not task_is_distributed:
|
||||
self.sess.run(self.init_op)
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
# Write the merged summaries to the current experiment directory
|
||||
if not task_is_distributed:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
elif self.network_is_local:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
|
||||
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
|
||||
# wait for all the workers to set their session
|
||||
if not self.network_is_local:
|
||||
self.wait_for_all_workers_barrier()
|
||||
|
||||
def reset_accumulated_gradients(self):
|
||||
"""
|
||||
Reset the gradients accumulation placeholder
|
||||
"""
|
||||
if self.accumulated_gradients is None:
|
||||
self.accumulated_gradients = self.sess.run(self.weights)
|
||||
|
||||
for ix, grad in enumerate(self.accumulated_gradients):
|
||||
self.accumulated_gradients[ix] = grad * 0
|
||||
|
||||
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
|
||||
no_accumulation=False):
|
||||
"""
|
||||
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
|
||||
placeholders
|
||||
:param additional_fetches: Optional tensors to fetch during gradients calculation
|
||||
:param inputs: The input batch for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the samples' losses won't be scaled
|
||||
:param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
replaced by the newly calculated gradients instead of being accumulated.
This can speed up the function runtime by around 10%.
|
||||
:return: A list containing the total loss and the individual network heads losses
|
||||
"""
|
||||
|
||||
if self.accumulated_gradients is None:
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
# feed inputs
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
|
||||
# feed targets
|
||||
targets = force_list(targets)
|
||||
for placeholder_idx, target in enumerate(targets):
|
||||
feed_dict[self.targets[placeholder_idx]] = target
|
||||
|
||||
# feed importance weights
|
||||
importance_weights = force_list(importance_weights)
|
||||
for placeholder_idx, target_ph in enumerate(targets):
|
||||
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
|
||||
importance_weight = np.ones(target_ph.shape[0])
|
||||
else:
|
||||
importance_weight = importance_weights[placeholder_idx]
|
||||
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
|
||||
|
||||
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
# feed the lstm state if necessary
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
# NOTE: this feeds the initial LSTM state, i.e. it assumes the batch starts a new sequence, which may not always hold
|
||||
feed_dict[self.middleware.c_in] = self.middleware.c_init
|
||||
feed_dict[self.middleware.h_in] = self.middleware.h_init
|
||||
|
||||
fetches = self.train_fetches + additional_fetches
|
||||
if self.ap.visualization.tensorboard:
|
||||
fetches += [self.merged]
|
||||
|
||||
# get grads
|
||||
result = self.sess.run(fetches, feed_dict=feed_dict)
|
||||
if hasattr(self, 'train_writer') and self.train_writer is not None:
|
||||
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
|
||||
|
||||
# extract the fetches
|
||||
norm_unclipped_grads, grads, total_loss, losses = result[:4]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
|
||||
fetched_tensors = []
|
||||
if len(additional_fetches) > 0:
|
||||
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
|
||||
len(additional_fetches)]
|
||||
|
||||
# accumulate the gradients
|
||||
for idx, grad in enumerate(grads):
|
||||
if no_accumulation:
|
||||
self.accumulated_gradients[idx] = grad
|
||||
else:
|
||||
self.accumulated_gradients[idx] += grad
|
||||
|
||||
return total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
|
||||
else:
|
||||
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
|
||||
|
||||
return [0]
|
||||
|
||||
def create_feed_dict(self, inputs):
|
||||
feed_dict = {}
|
||||
for input_name, input_value in inputs.items():
|
||||
if isinstance(input_name, str):
|
||||
if input_name not in self.inputs:
|
||||
raise ValueError((
|
||||
'input name {input_name} was provided to create a feed '
|
||||
'dictionary, but there is no placeholder with that name. '
|
||||
'placeholder names available include: {placeholder_names}'
|
||||
).format(
|
||||
input_name=input_name,
|
||||
placeholder_names=', '.join(self.inputs.keys())
|
||||
))
|
||||
|
||||
feed_dict[self.inputs[input_name]] = input_value
|
||||
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
|
||||
feed_dict[input_name] = input_value
|
||||
else:
|
||||
raise ValueError((
|
||||
'input dictionary expects strings or placeholders as keys, '
|
||||
'but found key {key} of type {type}'
|
||||
).format(
|
||||
key=input_name,
|
||||
type=type(input_name),
|
||||
))
|
||||
|
||||
return feed_dict
|
||||
|
||||
def apply_and_reset_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights and resets the accumulation placeholder
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
"""
|
||||
self.apply_gradients(gradients, scaler)
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
|
||||
"""
|
||||
Waits for all the workers to lock a certain lock and then continues
|
||||
:param lock: the name of the lock to use
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
if include_only_training_workers:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
|
||||
else:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
|
||||
|
||||
# lock
|
||||
if hasattr(self, '{}_counter'.format(lock)):
|
||||
self.sess.run(getattr(self, lock))
|
||||
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
|
||||
time.sleep(0.00001)
|
||||
# self.sess.run(getattr(self, '{}_init'.format(lock)))
|
||||
else:
|
||||
raise ValueError("no counter was defined for the lock {}".format(lock))
|
||||
|
||||
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
|
||||
"""
|
||||
A barrier that allows waiting for all the workers to finish a certain block of commands
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.lock_init)
|
||||
|
||||
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers
# continues and manages to increase the first lock again by one, only to have a late worker reset it afterwards.
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
|
||||
|
||||
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.release_init)
|
||||
|
||||
def apply_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
|
||||
The gradients will be MULTIPLIED by this factor
|
||||
"""
|
||||
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
|
||||
if hasattr(self, 'global_step') and not self.network_is_local:
|
||||
self.sess.run(self.inc_step)
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
# rescale the gradients so that they average out with the gradients from the other workers
|
||||
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
|
||||
scaler /= float(self.ap.task_parameters.num_training_tasks)
|
||||
|
||||
# rescale the gradients
|
||||
if scaler != 1.:
|
||||
for gradient in gradients:
|
||||
gradient *= scaler
|
||||
|
||||
# apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
if self.distributed_training and self.network_parameters.shared_optimizer \
|
||||
and not self.network_parameters.async_training:
|
||||
# synchronous distributed training with shared optimizer:
|
||||
# - each worker adds its gradients to the shared gradients accumulators
|
||||
# - we wait for all the workers to add their gradients
|
||||
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
|
||||
|
||||
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
|
||||
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
||||
if self.is_chief:
|
||||
self.sess.run(self.update_weights_from_shared_gradients)
|
||||
self.sess.run(self.init_shared_accumulated_gradients)
|
||||
else:
|
||||
# async distributed training / distributed training with independent optimizer
|
||||
# / non-distributed training - just apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
|
||||
|
||||
# release barrier
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
||||
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
|
||||
"""
|
||||
Run a forward pass of the network using the given input
|
||||
:param inputs: The input for the network
|
||||
:param outputs: The output for the network, defaults to self.outputs
|
||||
:param squeeze_output: call squeeze_list on output
|
||||
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
|
||||
:return: The network output
|
||||
|
||||
WARNING: must only be called once per state, since each call is assumed by the LSTM to be a new time step.
|
||||
"""
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
if initial_feed_dict:
|
||||
feed_dict.update(initial_feed_dict)
|
||||
if outputs is None:
|
||||
outputs = self.outputs
|
||||
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
|
||||
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
|
||||
|
||||
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
|
||||
feed_dict=feed_dict)
|
||||
else:
|
||||
output = self.sess.run(outputs, feed_dict)
|
||||
|
||||
if squeeze_output:
|
||||
output = squeeze_list(output)
|
||||
return output
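
# Illustrative usage sketch (added comment, not part of the original module): the keys of the
# inputs dict must match the network's input embedder names; 'observation' below is only a
# hypothetical example of such a key, and 'network' a hypothetical instance of this class.
#   states = {'observation': np.expand_dims(current_observation, 0)}
#   action_values = network.predict(states)
# for an LSTM middleware, predict() should be called exactly once per time step (see the warning above).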
|
||||
|
||||
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
|
||||
"""
|
||||
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
|
||||
:param additional_fetches: Optional tensors to fetch during the training process
|
||||
:param inputs: The input for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the samples' losses won't be scaled
|
||||
:return: The loss of the network
|
||||
"""
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
additional_fetches = force_list(additional_fetches)
|
||||
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
|
||||
importance_weights=importance_weights)
|
||||
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
|
||||
return loss
|
||||
|
||||
def get_weights(self):
|
||||
"""
|
||||
:return: a list of tensors containing the network weights for each layer
|
||||
"""
|
||||
return self.weights
|
||||
|
||||
def set_weights(self, weights, new_rate=1.0):
|
||||
"""
|
||||
Sets the network weights from the given list of weights tensors
|
||||
"""
|
||||
feed_dict = {}
|
||||
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
|
||||
for placeholder_idx, new_weight in enumerate(new_weights):
|
||||
feed_dict[self.weights_placeholders[placeholder_idx]]\
|
||||
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
|
||||
self.sess.run(self.update_weights_from_list, feed_dict)
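
# Illustrative usage sketch (added comment, not part of the original module): a common pattern is
# to have a target network slowly track an online network, with new_rate as the mixing coefficient.
# 'online_network' and 'target_network' are hypothetical instances of this class.
#   target_network.set_weights(online_network.get_weights(), new_rate=0.001)
# with the default new_rate=1.0, the target weights are simply overwritten by the online weights.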
|
||||
|
||||
def get_variable_value(self, variable):
|
||||
"""
|
||||
Get the value of a variable from the graph
|
||||
:param variable: the variable
|
||||
:return: the value of the variable
|
||||
"""
|
||||
return self.sess.run(variable)
|
||||
|
||||
def set_variable_value(self, assign_op, value, placeholder=None):
|
||||
"""
|
||||
Updates the value of a variable.
|
||||
This requires having an assign operation for the variable, and a placeholder which will provide the value
|
||||
:param assign_op: an assign operation for the variable
|
||||
:param value: a value to set the variable to
|
||||
:param placeholder: a placeholder to hold the given value for injecting it into the variable
|
||||
"""
|
||||
self.sess.run(assign_op, feed_dict={placeholder: value})
|
||||
|
||||
def reset_internal_memory(self):
|
||||
"""
|
||||
Reset any internal memory used by the network. For example, an LSTM internal state
|
||||
:return: None
|
||||
"""
|
||||
# initialize LSTM hidden states
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.curr_rnn_c_in = self.middleware.c_init
|
||||
self.curr_rnn_h_in = self.middleware.h_init
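
# Illustrative usage sketch (added comment, not part of the original module): when the middleware is
# an LSTM, the internal state should be cleared at episode boundaries so that predictions in a new
# episode do not reuse state from the previous one. 'network' is a hypothetical instance of this class.
#   network.reset_internal_memory()  # call at the start of every new episode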
|
||||
@@ -0,0 +1,102 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
|
||||
"""
|
||||
Creates a ClusterSpec object representing the cluster.
|
||||
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
|
||||
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
|
||||
:return: a ClusterSpec object representing the cluster
|
||||
"""
|
||||
# extract the parameter servers and workers from the given strings
|
||||
ps_hosts = parameters_server.split(",")
|
||||
worker_hosts = workers.split(",")
|
||||
|
||||
# Create a cluster spec from the parameter server and worker hosts
|
||||
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
|
||||
|
||||
return cluster_spec
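
# Illustrative usage sketch (not part of the original module); the host:port addresses below are
# placeholder example values.
def _example_cluster_spec() -> tf.train.ClusterSpec:
    # one parameter server and two workers, all on the local host
    return create_cluster_spec(parameters_server="localhost:2222",
                               workers="localhost:2223,localhost:2224")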
|
||||
|
||||
|
||||
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
|
||||
"""
|
||||
Create and start a parameter server
|
||||
:param cluster_spec: the ClusterSpec object representing the cluster
|
||||
:param config: the tensorflow config to use
|
||||
:return: None
|
||||
"""
|
||||
# create a server object for the parameter server
|
||||
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
|
||||
|
||||
# wait for the server to finish
|
||||
server.join()
|
||||
|
||||
|
||||
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
|
||||
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
|
||||
"""
|
||||
Creates a worker server and a device setter used to assign the workers operations to
|
||||
:param cluster_spec: a ClusterSpec object representing the cluster
|
||||
:param task_index: the index of the worker task
|
||||
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
|
||||
:param config: the tensorflow config to use
|
||||
:return: the target string for the tf.Session and the worker device setter object
|
||||
"""
|
||||
# Create and start a worker
|
||||
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
|
||||
|
||||
# Assign ops to the local worker
|
||||
worker_device = "/job:worker/task:{}".format(task_index)
|
||||
if use_cpu:
|
||||
worker_device += "/cpu:0"
|
||||
else:
|
||||
worker_device += "/device:GPU:0"
|
||||
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
|
||||
|
||||
return server.target, device
|
||||
|
||||
|
||||
def create_monitored_session(target: tf.train.Server, task_index: int,
|
||||
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
|
||||
"""
|
||||
Create a monitored session for the worker
|
||||
:param target: the target string for the tf.Session
|
||||
:param task_index: the task index of the worker
|
||||
:param checkpoint_dir: a directory path where the checkpoints will be stored
|
||||
:param save_checkpoint_secs: number of seconds between checkpoints storing
|
||||
:param config: the tensorflow configuration (optional)
|
||||
:return: the session to use for the run
|
||||
"""
|
||||
# we chose the first task to be the chief
|
||||
is_chief = task_index == 0
|
||||
|
||||
# Create the monitored session
|
||||
sess = tf.train.MonitoredTrainingSession(
|
||||
master=target,
|
||||
is_chief=is_chief,
|
||||
hooks=[],
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
save_checkpoint_secs=save_checkpoint_secs,
|
||||
config=config
|
||||
)
|
||||
|
||||
return sess
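
# Illustrative sketch (not part of the original module) of how the helpers above fit together for
# a single worker; the addresses, checkpoint directory and checkpoint interval are example values.
def _example_worker_setup(task_index: int = 0) -> tf.Session:
    cluster_spec = create_cluster_spec(parameters_server="localhost:2222",
                                       workers="localhost:2223,localhost:2224")
    target, device = create_worker_server_and_device(cluster_spec, task_index, use_cpu=True)
    with tf.device(device):
        pass  # the worker's graph would be built here, pinned via the replica device setter
    return create_monitored_session(target, task_index,
                                    checkpoint_dir="/tmp/example_checkpoints",
                                    save_checkpoint_secs=600)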
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
from rl_coach.core_types import InputEmbedding
|
||||
|
||||
|
||||
class InputEmbedder(object):
|
||||
"""
|
||||
An input embedder is the first part of the network, which takes the input from the state and produces a vector
|
||||
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
|
||||
can be multiple embedders in a single network
|
||||
"""
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
|
||||
self.name = name
|
||||
self.input_size = input_size
|
||||
self.activation_function = activation_function
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
self.dropout_rate = 0
|
||||
self.input = None
|
||||
self.output = None
|
||||
self.scheme = scheme
|
||||
self.return_type = InputEmbedding
|
||||
self.layers = []
|
||||
self.input_rescaling = input_rescaling
|
||||
self.input_offset = input_offset
|
||||
self.input_clipping = input_clipping
|
||||
|
||||
def __call__(self, prev_input_placeholder=None):
|
||||
with tf.variable_scope(self.get_name()):
|
||||
if prev_input_placeholder is None:
|
||||
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
|
||||
else:
|
||||
self.input = prev_input_placeholder
|
||||
self._build_module()
|
||||
|
||||
return self.input, self.output
|
||||
|
||||
def _build_module(self):
|
||||
# NOTE: for image inputs, we expect the data to be of type uint8 in order to be memory efficient. We chose not
# to implement the rescaling as an input filter (filters.observation.observation_filter), as this would have
# caused the input to the network to be stored as float, which is 4x more expensive in memory,
# thus also making each saved transition in the memory 4x more expensive.
|
||||
|
||||
input_layer = self.input / self.input_rescaling
|
||||
input_layer -= self.input_offset
|
||||
# clip the input using the given range
|
||||
if self.input_clipping is not None:
|
||||
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
|
||||
|
||||
self.layers.append(input_layer)
|
||||
|
||||
# layers order is conv -> batchnorm -> activation -> dropout
|
||||
if isinstance(self.scheme, EmbedderScheme):
|
||||
layers_params = self.schemes[self.scheme]
|
||||
else:
|
||||
layers_params = self.scheme
|
||||
for idx, layer_params in enumerate(layers_params):
|
||||
self.layers.append(
|
||||
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
|
||||
)
|
||||
|
||||
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
|
||||
self.activation_function, self.dropout,
|
||||
self.dropout_rate, idx))
|
||||
|
||||
self.output = tf.contrib.layers.flatten(self.layers[-1])
|
||||
|
||||
@property
|
||||
def input_size(self) -> List[int]:
|
||||
return self._input_size
|
||||
|
||||
@input_size.setter
|
||||
def input_size(self, value: Union[int, List[int]]):
|
||||
if isinstance(value, np.ndarray) or isinstance(value, tuple):
|
||||
value = list(value)
|
||||
elif isinstance(value, int):
|
||||
value = [value]
|
||||
if not isinstance(value, list):
|
||||
raise ValueError((
|
||||
'input_size expected to be a list, found {value} which has type {type}'
|
||||
).format(value=value, type=type(value)))
|
||||
self._input_size = value
|
||||
|
||||
@property
|
||||
def schemes(self):
|
||||
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
|
||||
"configurations.")
|
||||
|
||||
def get_name(self):
|
||||
return self.name
|
||||
@@ -0,0 +1,74 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputImageEmbedding
|
||||
|
||||
|
||||
class ImageEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that performs convolutions on the input and then flattens the result.
|
||||
The embedder is intended for image like inputs, where the channels are expected to be the last axis.
|
||||
The embedder also allows custom rescaling of the input prior to the neural network.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Conv2d([32, 3, 1])
|
||||
],
|
||||
|
||||
# atari dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Conv2d([32, 8, 4]),
|
||||
Conv2d([64, 4, 2]),
|
||||
Conv2d([64, 3, 1])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Conv2d([32, 5, 2]),
|
||||
Conv2d([32, 3, 1]),
|
||||
Conv2d([64, 3, 2]),
|
||||
Conv2d([64, 3, 1]),
|
||||
Conv2d([128, 3, 2]),
|
||||
Conv2d([128, 3, 1]),
|
||||
Conv2d([256, 3, 2]),
|
||||
Conv2d([256, 3, 1])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
|
||||
input_offset, input_clipping)
|
||||
self.return_type = InputImageEmbedding
|
||||
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
|
||||
.format(input_size))
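
# Illustrative usage sketch (not part of the original module): an Atari-style embedder over a stack
# of four 84x84 frames, using the Medium (DQN-like) scheme defined above. The sizes and the
# 'observation' name are example values.
def _example_image_embedder():
    embedder = ImageEmbedder([84, 84, 4], scheme=EmbedderScheme.Medium, name="observation")
    observation_placeholder, embedding = embedder()
    return observation_placeholder, embedding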
|
||||
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputVectorEmbedding
|
||||
|
||||
|
||||
class VectorEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that is intended for inputs that can be represented as vectors.
|
||||
The embedder flattens the input, applies several dense layers to it and returns the output.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Dense([128])
|
||||
],
|
||||
|
||||
# dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Dense([256])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
|
||||
input_rescaling, input_offset, input_clipping)
|
||||
|
||||
self.return_type = InputVectorEmbedding
|
||||
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("The input size of a vector embedder must contain only a single dimension")
|
||||
rl_coach/architectures/tensorflow_components/general_network.py (new file, 344 lines)
@@ -0,0 +1,344 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
|
||||
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
|
||||
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
|
||||
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
|
||||
from rl_coach.core_types import PredictionType
|
||||
|
||||
|
||||
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
|
||||
"""
|
||||
A generalized version of all possible networks implemented using tensorflow.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
self.global_network = global_network
|
||||
self.network_is_local = network_is_local
|
||||
self.network_wrapper_name = name.split('/')[0]
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
|
||||
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
|
||||
self.gradients_from_head_rescalers = []
|
||||
self.gradients_from_head_rescalers_placeholders = []
|
||||
self.update_head_rescaler_value_ops = []
|
||||
|
||||
self.adaptive_learning_rate_scheme = None
|
||||
self.current_learning_rate = None
|
||||
|
||||
# init network modules containers
|
||||
self.input_embedders = []
|
||||
self.output_heads = []
|
||||
super().__init__(agent_parameters, spaces, name, global_network,
|
||||
network_is_local, network_is_trainable)
|
||||
|
||||
def fill_return_types():
|
||||
ret_dict = {}
|
||||
for cls in get_all_subclasses(PredictionType):
|
||||
ret_dict[cls] = []
|
||||
components = self.input_embedders + [self.middleware] + self.output_heads
|
||||
for component in components:
|
||||
if not hasattr(component, 'return_type'):
|
||||
raise ValueError("{} has no return_type attribute. This should not happen.")
|
||||
if component.return_type is not None:
|
||||
ret_dict[component.return_type].append(component)
|
||||
|
||||
return ret_dict
|
||||
|
||||
self.available_return_types = fill_return_types()
|
||||
|
||||
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
|
||||
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
|
||||
"""
|
||||
Search for the component(s) whose return_type is set to the requested PredictionType, and get
predictions from them.
|
||||
|
||||
:param states: The input states to the network.
|
||||
:param prediction_type: The requested PredictionType to look for in the network components
|
||||
:return: A dictionary with predictions for all components matching the requested prediction type
|
||||
"""
|
||||
|
||||
ret_dict = {}
|
||||
for component in self.available_return_types[prediction_type]:
|
||||
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
|
||||
|
||||
return ret_dict
|
||||
|
||||
@staticmethod
|
||||
def get_activation_function(activation_function_string: str):
|
||||
"""
|
||||
Map the activation function from a string to the tensorflow framework equivalent
|
||||
:param activation_function_string: the type of the activation function
|
||||
:return: the tensorflow activation function
|
||||
"""
|
||||
activation_functions = {
|
||||
'relu': tf.nn.relu,
|
||||
'tanh': tf.nn.tanh,
|
||||
'sigmoid': tf.nn.sigmoid,
|
||||
'elu': tf.nn.elu,
|
||||
'selu': tf.nn.selu,
|
||||
'leaky_relu': tf.nn.leaky_relu,
|
||||
'none': None
|
||||
}
|
||||
assert activation_function_string in activation_functions.keys(), \
|
||||
"Activation function must be one of the following {}. instead it was: {}"\
|
||||
.format(activation_functions.keys(), activation_function_string)
|
||||
return activation_functions[activation_function_string]
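
# Illustrative usage sketch (added comment, not part of the original module):
#   GeneralTensorFlowNetwork.get_activation_function('relu')  # -> tf.nn.relu
#   GeneralTensorFlowNetwork.get_activation_function('none')  # -> None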
|
||||
|
||||
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
|
||||
"""
|
||||
Given an input embedder parameters class, creates the input embedder and returns it
|
||||
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
|
||||
be a value within the state or the action.
|
||||
:param embedder_params: the parameters of the class of the embedder
|
||||
:return: the embedder instance
|
||||
"""
|
||||
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
|
||||
allowed_inputs["action"] = copy.copy(self.spaces.action)
|
||||
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
|
||||
|
||||
if input_name not in allowed_inputs.keys():
|
||||
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
|
||||
.format(input_name, allowed_inputs.keys()))
|
||||
|
||||
type = "vector"
|
||||
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
|
||||
type = "image"
|
||||
|
||||
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
|
||||
embedder_params_copy = copy.copy(embedder_params)
|
||||
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
|
||||
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
|
||||
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
|
||||
embedder_params_copy.name = input_name
|
||||
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
|
||||
path=embedder_path,
|
||||
positional_args=[allowed_inputs[input_name].shape])
|
||||
return module
|
||||
|
||||
def get_middleware(self, middleware_params: MiddlewareParameters):
|
||||
"""
|
||||
Given a middleware type, creates the middleware and returns it
|
||||
:param middleware_params: the parameters of the middleware class
|
||||
:return: the middleware instance
|
||||
"""
|
||||
middleware_params_copy = copy.copy(middleware_params)
|
||||
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
|
||||
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
|
||||
return module
|
||||
|
||||
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
|
||||
"""
|
||||
Given a head type, creates the head and returns it
|
||||
:param head_params: the parameters of the head to create. The head class to instantiate is resolved
from these parameters (a path under the heads directory, or a full path in the
structure <module_path>:<class_path>)
:param head_idx: the head index
:param loss_weight: the weight to assign to this head's loss
|
||||
:return: the head
|
||||
"""
|
||||
|
||||
head_params_copy = copy.copy(head_params)
|
||||
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
|
||||
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
|
||||
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
|
||||
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
|
||||
|
||||
def get_model(self):
|
||||
# validate the configuration
|
||||
if len(self.network_parameters.input_embedders_parameters) == 0:
|
||||
raise ValueError("At least one input type should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) == 0:
|
||||
raise ValueError("At least one output type should be defined")
|
||||
|
||||
if self.network_parameters.middleware_parameters is None:
|
||||
raise ValueError("Exactly one middleware type should be defined")
|
||||
|
||||
if len(self.network_parameters.loss_weights) == 0:
|
||||
raise ValueError("At least one loss weight should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
|
||||
raise ValueError("Number of loss weights should match the number of output types")
|
||||
|
||||
for network_idx in range(self.num_networks):
|
||||
with tf.variable_scope('network_{}'.format(network_idx)):
|
||||
|
||||
####################
|
||||
# Input Embeddings #
|
||||
####################
|
||||
|
||||
state_embedding = []
|
||||
for input_name in sorted(self.network_parameters.input_embedders_parameters):
|
||||
input_type = self.network_parameters.input_embedders_parameters[input_name]
|
||||
# get the class of the input embedder
|
||||
input_embedder = self.get_input_embedder(input_name, input_type)
|
||||
self.input_embedders.append(input_embedder)
|
||||
|
||||
# input placeholders are reused between networks. on the first network, store the placeholders
|
||||
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
|
||||
# the existing input_placeholders into the input_embedders.
|
||||
if network_idx == 0:
|
||||
input_placeholder, embedding = input_embedder()
|
||||
self.inputs[input_name] = input_placeholder
|
||||
else:
|
||||
input_placeholder, embedding = input_embedder(self.inputs[input_name])
|
||||
|
||||
state_embedding.append(embedding)
|
||||
|
||||
##########
|
||||
# Merger #
|
||||
##########
|
||||
|
||||
if len(state_embedding) == 1:
|
||||
state_embedding = state_embedding[0]
|
||||
else:
|
||||
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
|
||||
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
|
||||
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
|
||||
state_embedding = tf.add_n(state_embedding, name="merger")
|
||||
|
||||
##############
|
||||
# Middleware #
|
||||
##############
|
||||
|
||||
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
|
||||
_, self.state_embedding = self.middleware(state_embedding)
|
||||
|
||||
################
|
||||
# Output Heads #
|
||||
################
|
||||
|
||||
head_count = 0
|
||||
for head_idx in range(self.num_heads_per_network):
|
||||
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
|
||||
if self.network_parameters.use_separate_networks_per_head:
|
||||
# if we use separate networks per head, then the head type corresponds to the network idx
|
||||
head_type_idx = network_idx
|
||||
head_count = network_idx
|
||||
else:
|
||||
# if we use a single network with multiple heads, then the head type is the current head idx
|
||||
head_type_idx = head_idx
|
||||
self.output_heads.append(
|
||||
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
|
||||
head_copy_idx,
|
||||
self.network_parameters.loss_weights[head_type_idx])
|
||||
)
|
||||
|
||||
# rescale the gradients from the head
|
||||
self.gradients_from_head_rescalers.append(
|
||||
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
|
||||
initializer=float(
|
||||
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
|
||||
),
|
||||
dtype=tf.float32))
|
||||
|
||||
self.gradients_from_head_rescalers_placeholders.append(
|
||||
tf.placeholder('float',
|
||||
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
|
||||
|
||||
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
|
||||
self.gradients_from_head_rescalers_placeholders[head_count]))
|
||||
|
||||
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
|
||||
self.gradients_from_head_rescalers[head_count] * self.state_embedding
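# note: the two terms above sum to state_embedding, so the value fed forward into the head is unchanged;
# only the gradient flowing back from this head into the shared embedding is scaled by the rescaler
# (rescaler = 1 passes the full gradient, rescaler = 0 behaves like a full stop_gradient)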
|
||||
|
||||
# build the head
|
||||
if self.network_is_local:
|
||||
output, target_placeholder, input_placeholders, importance_weight_ph = \
|
||||
self.output_heads[-1](head_input)
|
||||
|
||||
self.targets.extend(target_placeholder)
|
||||
self.importance_weights.extend(importance_weight_ph)
|
||||
else:
|
||||
output, input_placeholders = self.output_heads[-1](head_input)
|
||||
|
||||
self.outputs.extend(output)
|
||||
# TODO: use head names as well
|
||||
for placeholder_index, input_placeholder in enumerate(input_placeholders):
|
||||
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
|
||||
|
||||
head_count += 1
|
||||
|
||||
# Losses
|
||||
self.losses = tf.losses.get_losses(self.full_name)
|
||||
self.losses += tf.losses.get_regularization_losses(self.full_name)
|
||||
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
|
||||
# tf.summary.scalar('total_loss', self.total_loss)
|
||||
|
||||
# Learning rate
|
||||
if self.network_parameters.learning_rate_decay_rate != 0:
|
||||
self.adaptive_learning_rate_scheme = \
|
||||
tf.train.exponential_decay(
|
||||
self.network_parameters.learning_rate,
|
||||
self.global_step,
|
||||
decay_steps=self.network_parameters.learning_rate_decay_steps,
|
||||
decay_rate=self.network_parameters.learning_rate_decay_rate,
|
||||
staircase=True)
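# with staircase=True the decayed learning rate follows:
# lr(step) = learning_rate * decay_rate ** floor(step / decay_steps)
# e.g. learning_rate=0.001, decay_rate=0.5, decay_steps=10000 halves the rate every 10k steps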
|
||||
|
||||
self.current_learning_rate = self.adaptive_learning_rate_scheme
|
||||
else:
|
||||
self.current_learning_rate = self.network_parameters.learning_rate
|
||||
|
||||
# Optimizer
|
||||
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
|
||||
# distributed training + is a local network + optimizer shared -> take the global optimizer
|
||||
self.optimizer = self.global_network.optimizer
|
||||
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
|
||||
or self.network_parameters.shared_optimizer or not self.distributed_training:
|
||||
# distributed training + is a global network + optimizer shared
|
||||
# OR
|
||||
# distributed training + is a local network + optimizer not shared
|
||||
# OR
|
||||
# non-distributed training
|
||||
# -> create an optimizer
|
||||
|
||||
if self.network_parameters.optimizer_type == 'Adam':
|
||||
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
|
||||
beta1=self.network_parameters.adam_optimizer_beta1,
|
||||
beta2=self.network_parameters.adam_optimizer_beta2,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'RMSProp':
|
||||
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
|
||||
decay=self.network_parameters.rms_prop_optimizer_decay,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'LBFGS':
|
||||
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
|
||||
options={'maxiter': 25})
|
||||
else:
|
||||
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))
|
||||
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class CategoricalQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
|
||||
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class CategoricalQHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'categorical_dqn_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_atoms = agent_parameters.algorithm.atoms
|
||||
self.return_type = QActionStateValue
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
self.input = [self.actions]
|
||||
|
||||
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
|
||||
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
|
||||
self.num_atoms))
|
||||
# softmax on atoms dimension
|
||||
self.output = tf.nn.softmax(values_distribution)
|
||||
|
||||
# calculate cross entropy loss
|
||||
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
|
||||
name="distributions")
|
||||
self.target = self.distributions
|
||||
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
|
||||
tf.losses.add_loss(self.loss)
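# Illustrative numpy sketch (not part of this module): the head above outputs a probability mass over a
# fixed support of atoms per action, and acting greedily uses the expectation over that support,
# Q(s, a) = sum_i z_i * p_i(s, a). The support values below are assumed for the example only.
import numpy as np

num_actions, num_atoms = 3, 11
support = np.linspace(-10., 10., num_atoms)                          # assumed atom locations z_i
probs = np.random.dirichlet(np.ones(num_atoms), size=num_actions)    # softmax output, one row per action
q_values = (probs * support).sum(axis=-1)                            # expectation over the atoms
greedy_action = int(np.argmax(q_values))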
|
||||
@@ -0,0 +1,66 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class DDPGActorHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
|
||||
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
|
||||
class DDPGActor(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
|
||||
batchnorm: bool=True):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ddpg_actor_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
self.num_actions = self.spaces.action.shape
|
||||
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
# bounded actions
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
self.action_penalty = None
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
|
||||
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
|
||||
self.activation_function,
|
||||
False, 0, 0)[-1]
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
if self.is_local:
|
||||
# add a penalty on the squared pre-activation values of the action
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += \
|
||||
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
|
||||
self.output = [self.policy_mean]
|
||||
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
|
||||
|
||||
|
||||
class DNDQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
|
||||
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DNDQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dnd_q_values_head'
|
||||
self.DND_size = agent_parameters.algorithm.dnd_size
|
||||
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
|
||||
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
|
||||
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
|
||||
self.number_of_nn = agent_parameters.algorithm.number_of_knn
|
||||
self.ap = agent_parameters
|
||||
self.dnd_embeddings = [None] * self.num_actions
|
||||
self.dnd_values = [None] * self.num_actions
|
||||
self.dnd_indices = [None] * self.num_actions
|
||||
self.dnd_distances = [None] * self.num_actions
|
||||
if self.ap.memory.shared_memory:
|
||||
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
|
||||
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
|
||||
else:
|
||||
self.DND = differentiable_neural_dictionary.QDND(
|
||||
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
|
||||
key_error_threshold=self.DND_key_error_threshold,
|
||||
learning_rate=self.network_parameters.learning_rate,
|
||||
num_neighbors=self.number_of_nn,
|
||||
override_existing_keys=True)
|
||||
|
||||
# Retrieve info from DND dictionary
|
||||
# We assume that all actions have enough entries in the DND
|
||||
self.output = tf.transpose([
|
||||
self._q_value(input_layer, action)
|
||||
for action in range(self.num_actions)
|
||||
])
|
||||
|
||||
def _q_value(self, input_layer, action):
|
||||
result = tf.py_func(self.DND.query,
|
||||
[input_layer, action, self.number_of_nn],
|
||||
[tf.float64, tf.float64, tf.int64])
|
||||
self.dnd_embeddings[action] = tf.to_float(result[0])
|
||||
self.dnd_values[action] = tf.to_float(result[1])
|
||||
self.dnd_indices[action] = result[2]
|
||||
|
||||
# DND calculation
|
||||
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
|
||||
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
|
||||
self.dnd_distances[action] = distances
|
||||
weights = 1.0 / distances
|
||||
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
|
||||
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
|
||||
q_value.set_shape((None,))
|
||||
return q_value
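# Illustrative numpy sketch (not part of this module) of the DND read above: given the k nearest stored
# keys and their values, the Q-value is an inverse-squared-distance weighted average of the stored values.
import numpy as np

query = np.random.randn(4, 16)                 # batch of query embeddings
keys = np.random.randn(4, 5, 16)               # k=5 nearest stored keys per query
values = np.random.randn(4, 5)                 # stored Q-values for those keys
delta = 1e-3                                   # plays the role of l2_norm_added_delta

distances = np.square(keys - query[:, None, :]).sum(axis=2) + delta
weights = 1.0 / distances
weights /= weights.sum(axis=1, keepdims=True)
q = (values * weights).sum(axis=1)             # one Q-value per batch entry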
|
||||
|
||||
def _post_build(self):
|
||||
# DND gradients
|
||||
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
|
||||
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)
|
||||
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class DuelingQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
|
||||
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DuelingQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dueling_q_values_head'
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# state value tower - V
|
||||
with tf.variable_scope("state_value"):
|
||||
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
state_value = tf.layers.dense(state_value, 1, name='fc2')
|
||||
# state_value = tf.expand_dims(state_value, axis=-1)
|
||||
|
||||
# action advantage tower - A
|
||||
with tf.variable_scope("action_advantage"):
|
||||
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
|
||||
action_advantage = action_advantage - tf.reduce_mean(action_advantage)
|
||||
|
||||
# merge to state-action value function Q
|
||||
self.output = tf.add(state_value, action_advantage, name='output')
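# Illustrative numpy sketch (not part of this module) of the dueling aggregation
# Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'), where the mean is taken over the action dimension
# separately for every state in the batch (Wang et al., Dueling Network Architectures).
import numpy as np

batch, num_actions = 2, 4
state_value = np.random.randn(batch, 1)            # V(s)
advantage = np.random.randn(batch, num_actions)    # A(s, a)
q = state_value + advantage - advantage.mean(axis=1, keepdims=True)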
|
||||
165
rl_coach/architectures/tensorflow_components/heads/head.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Type
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, Parameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from tensorflow.python.ops.losses.losses_impl import Reduction
|
||||
|
||||
from rl_coach.utils import force_list
|
||||
|
||||
|
||||
# Used to initialize weights for policy and value output layers
|
||||
def normalized_columns_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
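# Illustrative check (not part of this module) of the initializer above: every output column is rescaled
# so that its l2 norm equals std, which keeps the initial outputs of the layer small and well conditioned.
import numpy as np

std = 0.01
w = np.random.randn(128, 4).astype(np.float32)
w *= std / np.sqrt(np.square(w).sum(axis=0, keepdims=True))
print(np.linalg.norm(w, axis=0))   # ~[0.01, 0.01, 0.01, 0.01]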
|
||||
|
||||
|
||||
class HeadParameters(Parameters):
|
||||
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
|
||||
super().__init__()
|
||||
self.activation_function = activation_function
|
||||
self.name = name
|
||||
self.parameterized_class_name = parameterized_class.__name__
|
||||
|
||||
|
||||
class Head(object):
|
||||
"""
|
||||
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
|
||||
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
|
||||
an assigned loss function. The heads are algorithm dependent.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
|
||||
self.head_idx = head_idx
|
||||
self.network_name = network_name
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
|
||||
self.name = "head"
|
||||
self.output = []
|
||||
self.loss = []
|
||||
self.loss_type = []
|
||||
self.regularizations = []
|
||||
self.loss_weight = force_list(loss_weight)
|
||||
self.target = []
|
||||
self.importance_weight = []
|
||||
self.input = []
|
||||
self.is_local = is_local
|
||||
self.ap = agent_parameters
|
||||
self.spaces = spaces
|
||||
self.return_type = None
|
||||
self.activation_function = activation_function
|
||||
|
||||
def __call__(self, input_layer):
|
||||
"""
|
||||
Wrapper for building the module graph including scoping and loss creation
|
||||
:param input_layer: the input to the graph
|
||||
:return: the output of the last layer and the target placeholder
|
||||
"""
|
||||
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
|
||||
self._build_module(input_layer)
|
||||
|
||||
self.output = force_list(self.output)
|
||||
self.target = force_list(self.target)
|
||||
self.input = force_list(self.input)
|
||||
self.loss_type = force_list(self.loss_type)
|
||||
self.loss = force_list(self.loss)
|
||||
self.regularizations = force_list(self.regularizations)
|
||||
if self.is_local:
|
||||
self.set_loss()
|
||||
self._post_build()
|
||||
|
||||
if self.is_local:
|
||||
return self.output, self.target, self.input, self.importance_weight
|
||||
else:
|
||||
return self.output, self.input
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
"""
|
||||
Builds the graph of the module
|
||||
This method is called early on from __call__. It is expected to store the graph
|
||||
in self.output.
|
||||
:param input_layer: the input to the graph
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def _post_build(self):
|
||||
"""
|
||||
Optional function that allows adding any extra definitions after the head has been fully defined
|
||||
For example, this allows doing additional calculations that are based on the loss
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_name(self):
|
||||
"""
|
||||
Get a formatted name for the module
|
||||
:return: the formatted name
|
||||
"""
|
||||
return '{}_{}'.format(self.name, self.head_idx)
|
||||
|
||||
def set_loss(self):
"""
Creates a target placeholder and a weighted loss function for each loss_type and regularization
:return: None
"""
|
||||
|
||||
# there are heads that define the loss internally, but we need to create additional placeholders for them
|
||||
for idx in range(len(self.loss)):
|
||||
importance_weight = tf.placeholder('float',
|
||||
[None] + [1] * (len(self.target[idx].shape) - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# add losses and target placeholder
|
||||
for idx in range(len(self.loss_type)):
|
||||
# create target placeholder
|
||||
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
|
||||
self.target.append(target)
|
||||
|
||||
# create importance sampling weights placeholder
|
||||
num_target_dims = len(self.target[idx].shape)
|
||||
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
|
||||
# weights the specific loss of this head against other losses in this head or in other heads
|
||||
loss_weight = self.loss_weight[idx]*importance_weight
|
||||
loss = self.loss_type[idx](self.target[-1], self.output[idx],
|
||||
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
|
||||
|
||||
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
|
||||
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
|
||||
|
||||
# we add the loss to the losses collection and later we will extract it in general_network
|
||||
tf.losses.add_loss(loss)
|
||||
self.loss.append(loss)
|
||||
|
||||
# add regularizations
|
||||
for regularization in self.regularizations:
|
||||
self.loss.append(regularization)
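# Illustrative numpy sketch (not part of this module) of the weighting performed above: the per-element
# loss is summed over the non-batch dimensions, multiplied by the per-sample importance weight and by
# this head's loss weight, and finally averaged over the batch.
import numpy as np

per_element_loss = np.random.rand(8, 3)        # e.g. squared errors, batch of 8, 3 target dims
importance_weight = np.random.rand(8, 1)       # per-sample importance sampling weights
loss_weight = 1.0                              # this head's weight relative to other losses

per_sample_loss = per_element_loss.sum(axis=1)
total_loss = (loss_weight * importance_weight[:, 0] * per_sample_loss).mean()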
|
||||
|
||||
@classmethod
def path(cls):
# the name of the head class itself (cls.__class__ would be the metaclass, not the head)
return cls.__name__
|
||||
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import Measurements
|
||||
|
||||
|
||||
class MeasurementsPredictionHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
|
||||
super().__init__(parameterized_class=MeasurementsPredictionHead,
|
||||
activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class MeasurementsPredictionHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'future_measurements_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_measurements = self.spaces.state['measurements'].shape[0]
|
||||
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
|
||||
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
|
||||
self.return_type = Measurements
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
|
||||
# actions expectation tower (expectation stream) - E
|
||||
with tf.variable_scope("expectation_stream"):
|
||||
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
|
||||
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
|
||||
|
||||
# action fine differences tower (action stream) - A
|
||||
with tf.variable_scope("action_stream"):
|
||||
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
|
||||
name='output')
|
||||
action_stream = tf.reshape(action_stream,
|
||||
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
|
||||
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
|
||||
|
||||
# merge to future measurements predictions
|
||||
self.output = tf.add(expectation_stream, action_stream, name='output')
|
||||
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
|
||||
name="targets")
|
||||
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
|
||||
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
|
||||
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class NAFHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
|
||||
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class NAFHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
if not isinstance(self.spaces.action, BoxActionSpace):
|
||||
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
|
||||
|
||||
self.name = 'naf_q_values_head'
|
||||
self.num_actions = self.spaces.action.shape[0]
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# NAF
|
||||
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
|
||||
self.input = self.action
|
||||
|
||||
# V Head
|
||||
self.V = tf.layers.dense(input_layer, 1, name='V')
|
||||
|
||||
# mu Head
|
||||
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
|
||||
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
|
||||
|
||||
# A Head
|
||||
# l_vector holds the entries of a lower-triangular matrix
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
|
||||
|
||||
# Convert l to a lower triangular matrix and exponentiate its diagonal
|
||||
|
||||
i = 0
|
||||
columns = []
|
||||
for col in range(self.num_actions):
|
||||
start_row = col
|
||||
num_non_zero_elements = self.num_actions - start_row
|
||||
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
|
||||
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
|
||||
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
|
||||
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
|
||||
i += num_non_zero_elements
|
||||
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
|
||||
|
||||
# P = L*L^T
|
||||
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
|
||||
|
||||
# A = -1/2 * (u - mu)^T * P * (u - mu)
|
||||
action_diff = tf.expand_dims(self.action - self.mu, -1)
|
||||
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
|
||||
self.A = tf.reshape(a_matrix_form, [-1, 1])
|
||||
|
||||
# Q Head
|
||||
self.Q = tf.add(self.V, self.A, name='Q')
|
||||
|
||||
self.output = self.Q
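# Illustrative numpy sketch (not part of this module) of the NAF decomposition built above: a flat vector
# of num_actions * (num_actions + 1) / 2 entries is unpacked into a lower-triangular matrix L with an
# exponentiated diagonal, P = L L^T is positive definite, and the advantage is a quadratic in (a - mu).
# The packing here is row-wise for brevity, whereas the module above fills L column by column.
import numpy as np

num_actions = 3
l_vector = np.random.randn(num_actions * (num_actions + 1) // 2)
L = np.zeros((num_actions, num_actions))
rows, cols = np.tril_indices(num_actions)
L[rows, cols] = l_vector
L[np.diag_indices(num_actions)] = np.exp(np.diag(L))   # exponentiate the diagonal

P = L @ L.T
action, mu, V = np.random.randn(num_actions), np.random.randn(num_actions), 0.7
A = -0.5 * (action - mu) @ P @ (action - mu)
Q = V + A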
|
||||
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
|
||||
|
||||
class PolicyHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
|
||||
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PolicyHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'policy_values_head'
|
||||
self.return_type = ActionProbabilities
|
||||
self.beta = None
|
||||
self.action_penalty = None
|
||||
|
||||
self.exploration_policy = agent_parameters.exploration
|
||||
|
||||
# a scalar weight that penalizes low entropy values to encourage exploration
|
||||
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
|
||||
if hasattr(agent_parameters.algorithm, 'action_penalty'):
|
||||
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = []
|
||||
self.input = self.actions
|
||||
self.policy_distributions = []
|
||||
self.output = []
|
||||
|
||||
action_spaces = [self.spaces.action]
|
||||
if isinstance(self.spaces.action, CompoundActionSpace):
|
||||
action_spaces = self.spaces.action.sub_action_spaces
|
||||
|
||||
# create a compound action network
|
||||
for action_space_idx, action_space in enumerate(action_spaces):
|
||||
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
|
||||
if isinstance(action_space, DiscreteActionSpace):
|
||||
# create a discrete action network (softmax probabilities output)
|
||||
self._build_discrete_net(input_layer, action_space)
|
||||
elif isinstance(action_space, BoxActionSpace):
|
||||
# create a continuous action network (bounded mean and stdev outputs)
|
||||
self._build_continuous_net(input_layer, action_space)
|
||||
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
|
||||
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
|
||||
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate loss
|
||||
self.action_log_probs_wrt_policy = \
|
||||
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
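# Illustrative numpy sketch (not part of this module) of the policy-gradient loss formed above:
# loss = -mean( log pi(a_t | s_t) * A_t ), so actions with positive advantage become more likely.
import numpy as np

log_probs = np.log(np.array([0.2, 0.7, 0.5]))   # log pi(a_t | s_t) for 3 sampled transitions
advantages = np.array([1.5, -0.3, 0.8])
loss = -(log_probs * advantages).mean()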
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
|
||||
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
|
||||
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
|
||||
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
self.output.append(self.policy_probs)
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape
|
||||
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
|
||||
|
||||
# output activation function
|
||||
if np.all(self.spaces.action.max_abs_range < np.inf):
|
||||
# bounded actions
|
||||
self.output_scale = action_space.max_abs_range
|
||||
self.continuous_output_activation = self.activation_function
|
||||
else:
|
||||
# unbounded actions
|
||||
self.output_scale = 1
|
||||
self.continuous_output_activation = None
|
||||
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
|
||||
policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean)
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
self.output.append(self.policy_mean)
|
||||
|
||||
# standard deviation
|
||||
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
|
||||
# the stdev is an output of the network and uses a softplus activation as defined in A3C
|
||||
policy_values_std = tf.layers.dense(input_layer, num_actions,
|
||||
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
|
||||
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
|
||||
|
||||
self.output.append(self.policy_std)
|
||||
else:
|
||||
# the stdev is an externally given value
|
||||
# Warning: we need to explicitly put this variable in the local variables collections, since defining
|
||||
# it as not trainable puts it for some reason in the global variables collections. If this is not done,
|
||||
# the variable won't be initialized and when working with multiple workers they will get stuck.
|
||||
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
|
||||
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
|
||||
# assign op for the policy std
|
||||
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
|
||||
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
|
||||
if self.is_local:
|
||||
# add a penalty on the squared pre-activation values of the action
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += [
|
||||
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
144
rl_coach/architectures/tensorflow_components/heads/ppo_head.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
|
||||
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
# used in regular PPO
|
||||
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
|
||||
if self.use_kl_regularization:
|
||||
# kl coefficient and its corresponding assignment operation and placeholder
|
||||
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
|
||||
trainable=False, name='kl_coefficient')
|
||||
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
|
||||
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
|
||||
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
|
||||
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
|
||||
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self._build_discrete_net(input_layer, self.spaces.action)
|
||||
elif isinstance(self.spaces.action, BoxActionSpace):
|
||||
self._build_continuous_net(input_layer, self.spaces.action)
|
||||
else:
|
||||
raise ValueError("only discrete or continuous action spaces are supported for PPO")
|
||||
|
||||
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
|
||||
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
|
||||
# Used by regular PPO only
|
||||
# add kl divergence regularization
|
||||
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
|
||||
|
||||
if self.use_kl_regularization:
|
||||
# no clipping => use kl regularization
|
||||
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
|
||||
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
|
||||
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate surrogate loss
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
# exponentiating the difference of the log-probabilities gives the likelihood ratio pi / pi_old, which is always positive
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
|
||||
if self.clip_likelihood_ratio_using_epsilon is not None:
|
||||
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
|
||||
self.input.append(self.clip_param_rescaler)
|
||||
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
|
||||
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
|
||||
self.clipped_likelihood_ratio * self.advantages)
|
||||
else:
|
||||
self.scaled_advantages = self.likelihood_ratio * self.advantages
|
||||
# the minus sign turns maximizing the surrogate objective into a minimization problem for the optimizer
|
||||
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
self.loss = self.surrogate_loss
|
||||
tf.losses.add_loss(self.loss)
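# Illustrative numpy sketch (not part of this module) of the clipped surrogate objective assembled above:
# L = -mean( min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) ), with r_t = pi(a|s) / pi_old(a|s).
import numpy as np

eps = 0.2                                      # clip_likelihood_ratio_using_epsilon * clip_param_rescaler
ratio = np.array([0.7, 1.5, 1.05])             # likelihood ratios for 3 transitions
advantages = np.array([1.0, 1.0, -2.0])
clipped = np.clip(ratio, 1 - eps, 1 + eps)
surrogate_loss = -np.minimum(ratio * advantages, clipped * advantages).mean()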
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
# Policy Head
|
||||
self.input = [self.actions, self.old_policy_mean]
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
|
||||
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
|
||||
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
|
||||
|
||||
self.output = self.policy_mean
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape[0]
|
||||
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
|
||||
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
|
||||
kernel_initializer=normalized_columns_initializer(0.01))
|
||||
if self.is_local:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
|
||||
collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
else:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
|
||||
|
||||
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
|
||||
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
|
||||
|
||||
self.output = [self.policy_mean, self.policy_std]
|
||||
@@ -0,0 +1,52 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOVHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
|
||||
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOVHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_v_head'
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
|
||||
self.input = [self.old_policy_value]
|
||||
self.output = tf.layers.dense(input_layer, 1, name='output',
|
||||
kernel_initializer=normalized_columns_initializer(1.0))
|
||||
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
|
||||
|
||||
value_loss_1 = tf.square(self.output - self.target)
|
||||
value_loss_2 = tf.square(self.old_policy_value +
|
||||
tf.clip_by_value(self.output - self.old_policy_value,
|
||||
-self.clip_likelihood_ratio_using_epsilon,
|
||||
self.clip_likelihood_ratio_using_epsilon) - self.target)
|
||||
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
|
||||
self.loss = self.vf_loss
|
||||
tf.losses.add_loss(self.loss)
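# Illustrative numpy sketch (not part of this module) of the clipped value loss above: the update is
# pessimistic, taking the larger of the unclipped squared error and the error of a prediction that is
# only allowed to move by eps away from the old value estimate.
import numpy as np

eps = 0.2
v_new = np.array([1.0, 2.5])
v_old = np.array([0.8, 1.0])
returns = np.array([1.2, 1.1])
v_clipped = v_old + np.clip(v_new - v_old, -eps, eps)
vf_loss = np.maximum(np.square(v_new - returns), np.square(v_clipped - returns)).mean()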
|
||||
50
rl_coach/architectures/tensorflow_components/heads/q_head.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class QHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
|
||||
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class QHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'q_values_head'
|
||||
if isinstance(self.spaces.action, BoxActionSpace):
|
||||
self.num_actions = 1
|
||||
elif isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# Standard Q Network
|
||||
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition

from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue


class QuantileRegressionQHeadParameters(HeadParameters):
    def __init__(self, activation_function: str='relu', name: str='quantile_regression_q_head_params'):
        super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
                         name=name)


class QuantileRegressionQHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
        self.name = 'quantile_regression_dqn_head'
        self.num_actions = len(self.spaces.action.actions)
        self.num_atoms = agent_parameters.algorithm.atoms  # we use atom / quantile interchangeably
        self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval  # k
        self.return_type = QActionStateValue

    def _build_module(self, input_layer):
        self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
        self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
        self.input = [self.actions, self.quantile_midpoints]

        # the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
        quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
        quantiles_locations = tf.reshape(quantiles_locations,
                                         (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
        self.output = quantiles_locations

        self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
        self.target = self.quantiles

        # only the quantiles of the taken action are taken into account
        quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)

        # reorder the output quantiles and the target quantiles as a preparation step for calculating the loss
        # the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles)
        # the target quantiles vector is tiled as columns of an NxN matrix
        theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
        T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
        tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])

        # Huber loss of T(theta_j) - theta_i
        error = T_theta_j - theta_i
        abs_error = tf.abs(error)
        quadratic = tf.minimum(abs_error, self.huber_loss_interval)
        huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2

        # Quantile Huber loss
        quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss

        # Quantile regression loss (the probability for each quantile is 1/num_quantiles)
        quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
        self.loss = quantile_regression_loss
        tf.losses.add_loss(self.loss)
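A short sketch of the inputs this head expects when its loss is evaluated. The feed values are illustrative assumptions, but they match the placeholder shapes defined above: [batch, 2] index pairs for tf.gather_nd, and [batch, N] quantile midpoints tau_hat_i = (i + 0.5) / N as in quantile regression DQN.

import numpy as np

num_atoms = 8      # N quantiles (agent_parameters.algorithm.atoms)
batch_size = 2

# each row is (batch index, action index), so tf.gather_nd picks the
# quantile locations of the action that was actually taken
actions_feed = np.array([[0, 3], [1, 1]], dtype=np.int32)

# quantile midpoints tau_hat_i = (i + 0.5) / N, tiled over the batch
tau_hat = (np.arange(num_atoms) + 0.5) / num_atoms
quantile_midpoints_feed = np.tile(tau_hat, (batch_size, 1)).astype(np.float32)

# the target quantiles (Bellman-updated quantile locations) would normally
# come from the target network; zeros here are just a placeholder value
quantiles_feed = np.zeros((batch_size, num_atoms), dtype=np.float32)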
45
rl_coach/architectures/tensorflow_components/heads/v_head.py
Normal file
@@ -0,0 +1,45 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition

from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import VStateValue


class VHeadParameters(HeadParameters):
    def __init__(self, activation_function: str='relu', name: str='v_head_params'):
        super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)


class VHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
        self.name = 'v_values_head'
        self.return_type = VStateValue

        if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
            self.loss_type = tf.losses.huber_loss
        else:
            self.loss_type = tf.losses.mean_squared_error

    def _build_module(self, input_layer):
        # Standard V Network
        self.output = tf.layers.dense(input_layer, 1, name='output',
                                      kernel_initializer=normalized_columns_initializer(1.0))
@@ -0,0 +1,86 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List

import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme

from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
from rl_coach.core_types import Middleware_FC_Embedding


class FCMiddlewareParameters(MiddlewareParameters):
    def __init__(self, activation_function='relu',
                 scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_fc_embedder"):
        super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
                         scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)


class FCMiddleware(Middleware):
    schemes = {
        MiddlewareScheme.Empty:
            [],

        # ppo
        MiddlewareScheme.Shallow:
            [
                Dense([64])
            ],

        # dqn
        MiddlewareScheme.Medium:
            [
                Dense([512])
            ],

        MiddlewareScheme.Deep:
            [
                Dense([128]),
                Dense([128]),
                Dense([128])
            ]
    }

    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_fc_embedder"):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout=dropout, scheme=scheme, name=name)
        self.return_type = Middleware_FC_Embedding
        self.layers = []

    def _build_module(self):
        self.layers.append(self.input)

        if isinstance(self.scheme, MiddlewareScheme):
            layers_params = FCMiddleware.schemes[self.scheme]
        else:
            layers_params = self.scheme
        for idx, layer_params in enumerate(layers_params):
            self.layers.append(
                layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
            )

            self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
                                                            self.activation_function, self.dropout,
                                                            self.dropout_rate, idx))

        self.output = self.layers[-1]
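Because _build_module accepts either a MiddlewareScheme preset or a plain list of layer definitions, a custom stack can be passed in place of a preset. A hedged sketch using only names defined in this file (the layer sizes are arbitrary examples):

# preset: the Medium scheme above, i.e. a single Dense([512]) layer
fc_params = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)

# override: an explicit list of Dense layer definitions instead of a preset
custom_fc_params = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256])],
                                          batchnorm=True)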
@@ -0,0 +1,113 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme

from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import Middleware_LSTM_Embedding


class LSTMMiddlewareParameters(MiddlewareParameters):
    def __init__(self, activation_function='relu', number_of_lstm_cells=256,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_lstm_embedder"):
        super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
                         scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
        self.number_of_lstm_cells = number_of_lstm_cells


class LSTMMiddleware(Middleware):
    schemes = {
        MiddlewareScheme.Empty:
            [],

        # ppo
        MiddlewareScheme.Shallow:
            [
                [64]
            ],

        # dqn
        MiddlewareScheme.Medium:
            [
                [512]
            ],

        MiddlewareScheme.Deep:
            [
                [128],
                [128],
                [128]
            ]
    }

    def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_lstm_embedder"):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout=dropout, scheme=scheme, name=name)
        self.return_type = Middleware_LSTM_Embedding
        self.number_of_lstm_cells = number_of_lstm_cells
        self.layers = []

    def _build_module(self):
        """
        self.state_in: tuple of placeholders containing the initial state
        self.state_out: tuple of output state

        todo: it appears that the shape of the output is batch, feature
        the code here seems to be slicing off the first element in the batch
        which would definitely be wrong. need to double check the shape
        """

        self.layers.append(self.input)

        # optionally insert some dense layers before the LSTM
        if isinstance(self.scheme, MiddlewareScheme):
            layers_params = LSTMMiddleware.schemes[self.scheme]
        else:
            layers_params = self.scheme
        for idx, layer_params in enumerate(layers_params):
            self.layers.append(
                tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
            )

            self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
                                                            self.activation_function, self.dropout,
                                                            self.dropout_rate, idx))

        # add the LSTM layer
        lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
        self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
        self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
        self.state_init = [self.c_init, self.h_init]
        self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
        self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
        self.state_in = (self.c_in, self.h_in)
        rnn_in = tf.expand_dims(self.layers[-1], [0])
        step_size = tf.shape(self.layers[-1])[:1]
        state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
            lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
        lstm_c, lstm_h = lstm_state
        self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
        self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
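A rough, self-contained sketch of how the recurrent state placeholders defined above are threaded between consecutive steps, assuming a 64-dimensional embedding stands in for the aggregated input embedders; the actual wiring inside Coach differs:

import numpy as np
import tensorflow as tf

# stand-in for the embedding that would normally come from the input embedders
input_placeholder = tf.placeholder(tf.float32, [None, 64])

middleware = LSTMMiddleware(number_of_lstm_cells=256)
middleware(input_placeholder)   # builds the dense + LSTM stack under its variable scope

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    c, h = middleware.state_init            # zero state created in _build_module
    for _ in range(3):                      # three consecutive steps of one episode
        step_input = np.random.randn(1, 64).astype(np.float32)
        out, (c, h) = sess.run([middleware.output, middleware.state_out],
                               feed_dict={input_placeholder: step_input,
                                          middleware.c_in: c,
                                          middleware.h_in: h})
        # (c, h) is fed back in on the next step so the LSTM carries state across the episode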
@@ -0,0 +1,68 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type, Union, List

import tensorflow as tf
from rl_coach.base_parameters import MiddlewareScheme, Parameters

from rl_coach.core_types import MiddlewareEmbedding


class MiddlewareParameters(Parameters):
    def __init__(self, parameterized_class: Type['Middleware'],
                 activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
                 batchnorm: bool=False, dropout: bool=False,
                 name='middleware'):
        super().__init__()
        self.activation_function = activation_function
        self.scheme = scheme
        self.batchnorm = batchnorm
        self.dropout = dropout
        self.name = name
        self.parameterized_class_name = parameterized_class.__name__


class Middleware(object):
    """
    A middleware embedder is the middle part of the network. It takes the embeddings from the input embedders,
    after they were aggregated in some way (for example, concatenation), and passes them through a neural network
    which is customizable but shared between the heads of the network.
    """
    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
        self.name = name
        self.input = None
        self.output = None
        self.activation_function = activation_function
        self.batchnorm = batchnorm
        self.dropout = dropout
        self.dropout_rate = 0
        self.scheme = scheme
        self.return_type = MiddlewareEmbedding

    def __call__(self, input_layer):
        with tf.variable_scope(self.get_name()):
            self.input = input_layer
            self._build_module()

        return self.input, self.output

    def _build_module(self):
        pass

    def get_name(self):
        return self.name
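A minimal sketch of the contract a concrete middleware fulfils: __call__ stores the input, opens a variable scope and delegates to _build_module, which must set self.output. The IdentityMiddleware below is purely illustrative and not part of Coach:

import tensorflow as tf

class IdentityMiddleware(Middleware):
    """Illustrative middleware that passes the aggregated embedding through unchanged."""
    def _build_module(self):
        # self.input was set by Middleware.__call__ before this method runs
        self.output = tf.identity(self.input, name='identity')

embedding = tf.placeholder(tf.float32, [None, 32])
middleware_input, middleware_output = IdentityMiddleware(name='identity_middleware')(embedding)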
121
rl_coach/architectures/tensorflow_components/shared_variables.py
Normal file
@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import tensorflow as tf


class SharedRunningStats(object):
    def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
        self.sess = None
        self.name = name
        self.replicated_device = replicated_device
        self.epsilon = epsilon
        self.ops_were_created = False
        if create_ops:
            with tf.device(replicated_device):
                self.create_ops()

    def create_ops(self, shape=[1], clip_values=None):
        self.clip_values = clip_values
        with tf.variable_scope(self.name):
            self._sum = tf.get_variable(
                dtype=tf.float64,
                initializer=tf.constant_initializer(0.0),
                name="running_sum", trainable=False, shape=shape, validate_shape=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES])
            self._sum_squared = tf.get_variable(
                dtype=tf.float64,
                initializer=tf.constant_initializer(self.epsilon),
                name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES])
            self._count = tf.get_variable(
                dtype=tf.float64,
                shape=(),
                initializer=tf.constant_initializer(self.epsilon),
                name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])

            self._shape = None
            self._mean = tf.div(self._sum, self._count, name="mean")
            self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
                                           / tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
            self.tf_mean = tf.cast(self._mean, 'float32')
            self.tf_std = tf.cast(self._std, 'float32')

            self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
            self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
            self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')

            self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
            self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
            self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)

            self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
            self.normalized_obs = (self.raw_obs - self._mean) / self._std
            if self.clip_values is not None:
                self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])

        self.ops_were_created = True

    def set_session(self, sess):
        self.sess = sess

    def push(self, x):
        x = x.astype('float64')
        self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
                      feed_dict={
                          self.new_sum: x.sum(axis=0).ravel(),
                          self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
                          self.newcount: np.array(len(x), dtype='float64')
                      })
        if self._shape is None:
            self._shape = x.shape

    @property
    def n(self):
        return self.sess.run(self._count)

    @property
    def mean(self):
        return self.sess.run(self._mean)

    @property
    def var(self):
        return self.std ** 2

    @property
    def std(self):
        return self.sess.run(self._std)

    @property
    def shape(self):
        return self._shape

    @shape.setter
    def shape(self, val):
        self._shape = val
        self.new_sum.set_shape(val)
        self.new_sum_squared.set_shape(val)
        self.tf_mean.set_shape(val)
        self.tf_std.set_shape(val)
        self._sum.set_shape(val)
        self._sum_squared.set_shape(val)

    def normalize(self, batch):
        if self.clip_values is not None:
            return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
        else:
            return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
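A short usage sketch of the running-statistics filter above, assuming a single-process setup (no replicated device). The class accumulates a running sum, sum of squares and count, so the statistics can be recovered incrementally as mean = sum / count and std = sqrt((sum_sq - count * mean^2) / (count - 1)); the observation dimensionality and clip range below are arbitrary examples.

import numpy as np
import tensorflow as tf

stats = SharedRunningStats(name='observation_stats', create_ops=False)
stats.create_ops(shape=[3], clip_values=(-5.0, 5.0))  # 3-dim observations, clipped to +/- 5 std

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    stats.set_session(sess)

    batch = np.random.randn(32, 3)       # a batch of 32 observations
    stats.push(batch)                    # updates the running sum / sum-of-squares / count
    normalized = stats.normalize(batch)  # (x - running mean) / running std, then clipped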