Mirror of https://github.com/gryf/coach.git, synced 2026-02-13 20:35:48 +01:00
Batch RL Tutorial (#372)
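This commit threads a new optional output_bias_initializer argument through the TensorFlow head implementations (CategoricalQHead, RegressionHead, DDPGVHead, PPOVHead, QHead, QuantileRegressionQHead, SACQHead, TD3VHead and VHead) and down into the Dense and NoisyNetDense layer wrappers, so that presets such as the batch RL tutorial can control how the bias of each head's output layer is initialized. Leaving the argument at None preserves the previous behaviour of tf.zeros_initializer().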
@@ -26,9 +26,9 @@ from rl_coach.spaces import SpacesDefinition
 class CategoricalQHead(QHead):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu',
-                 dense_layer=Dense):
+                 dense_layer=Dense, output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
-                         dense_layer=dense_layer)
+                         dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
         self.name = 'categorical_dqn_head'
         self.num_actions = len(self.spaces.action.actions)
         self.num_atoms = agent_parameters.algorithm.atoms
@@ -37,7 +37,8 @@ class CategoricalQHead(QHead):
         self.loss_type = []

     def _build_module(self, input_layer):
-        values_distribution = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
+        values_distribution = self.dense_layer(self.num_actions * self.num_atoms)\
+            (input_layer, name='output', bias_initializer=self.output_bias_initializer)
         values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
                                                                self.num_atoms))
         # softmax on atoms dimension

@@ -27,7 +27,7 @@ from rl_coach.utils import force_list
 class RegressionHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense, scheme=[Dense(256), Dense(256)]):
+                 dense_layer=Dense, scheme=[Dense(256), Dense(256)], output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'regression_head'
@@ -42,6 +42,7 @@ class RegressionHead(Head):
             self.loss_type = tf.losses.huber_loss
         else:
             self.loss_type = tf.losses.mean_squared_error
+        self.output_bias_initializer = output_bias_initializer

     def _build_module(self, input_layer):
         self.layers.append(input_layer)
@@ -50,7 +51,8 @@ class RegressionHead(Head):
                 layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
             ))

-        self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output'))
+        self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output',
+                                                              bias_initializer=self.output_bias_initializer))
         self.output = self.layers[-1]

     def __str__(self):

@@ -24,9 +24,10 @@ from rl_coach.spaces import SpacesDefinition
 class DDPGVHead(VHead):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense, initializer='normalized_columns'):
+                 dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
-                         dense_layer=dense_layer, initializer=initializer)
+                         dense_layer=dense_layer, initializer=initializer,
+                         output_bias_initializer=output_bias_initializer)

     def _build_module(self, input_layer):
         super()._build_module(input_layer)

@@ -26,18 +26,20 @@ from rl_coach.spaces import SpacesDefinition
 class PPOVHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense):
+                 dense_layer=Dense, output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'ppo_v_head'
         self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
         self.return_type = ActionProbabilities
+        self.output_bias_initializer = output_bias_initializer

     def _build_module(self, input_layer):
         self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
         self.input = [self.old_policy_value]
         self.output = self.dense_layer(1)(input_layer, name='output',
-                                          kernel_initializer=normalized_columns_initializer(1.0))
+                                          kernel_initializer=normalized_columns_initializer(1.0),
+                                          bias_initializer=self.output_bias_initializer)
         self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")

         value_loss_1 = tf.square(self.output - self.target)

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpac
 class QHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense):
+                 dense_layer=Dense, output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'q_values_head'
@@ -46,9 +46,12 @@ class QHead(Head):
         else:
             self.loss_type = tf.losses.mean_squared_error

+        self.output_bias_initializer = output_bias_initializer
+
     def _build_module(self, input_layer):
         # Standard Q Network
-        self.q_values = self.output = self.dense_layer(self.num_actions)(input_layer, name='output')
+        self.q_values = self.output = self.dense_layer(self.num_actions)\
+            (input_layer, name='output', bias_initializer=self.output_bias_initializer)

         # used in batch-rl to estimate a probablity distribution over actions
         self.softmax = self.add_softmax_with_temperature()

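As a point of reference, here is a minimal sketch of what the new Q-head output line boils down to in plain TF 1.x, assuming the Dense wrapper semantics shown further down (the helper name q_output_sketch is illustrative and not part of the commit):

import tensorflow as tf

def q_output_sketch(input_layer, num_actions, output_bias_initializer=None):
    # mirrors Dense.__call__: a None initializer falls back to zeros,
    # so presets that do not set it keep the old behaviour
    if output_bias_initializer is None:
        output_bias_initializer = tf.zeros_initializer()
    return tf.layers.dense(input_layer, num_actions, name='output',
                           bias_initializer=output_bias_initializer)
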
@@ -25,9 +25,9 @@ from rl_coach.spaces import SpacesDefinition
 class QuantileRegressionQHead(QHead):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense):
+                 dense_layer=Dense, output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
-                         dense_layer=dense_layer)
+                         dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
         self.name = 'quantile_regression_dqn_head'
         self.num_actions = len(self.spaces.action.actions)
         self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
@@ -43,7 +43,8 @@ class QuantileRegressionQHead(QHead):
         self.input = [self.actions, self.quantile_midpoints]

         # the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
-        quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
+        quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)\
+            (input_layer, name='output', bias_initializer=self.output_bias_initializer)
         quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
         self.output = quantiles_locations

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace
 class SACQHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense):
+                 dense_layer=Dense, output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'q_values_head'
@@ -41,6 +41,7 @@ class SACQHead(Head):
         self.return_type = QActionStateValue
         # extract the topology from the SACQHeadParameters
         self.network_layers_sizes = agent_parameters.network_wrappers['q'].heads_parameters[0].network_layers_sizes
+        self.output_bias_initializer = output_bias_initializer

     def _build_module(self, input_layer):
         # SAC Q network is basically 2 networks running in parallel on the same input (state , action)
@@ -63,7 +64,8 @@ class SACQHead(Head):
             for layer_size in self.network_layers_sizes[1:]:
                 qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
             # the output layer
-            self.q1_output = self.dense_layer(1)(qi_output, name='q1_output')
+            self.q1_output = self.dense_layer(1)(qi_output, name='q1_output',
+                                                 bias_initializer=self.output_bias_initializer)

         # build q2 network head
         with tf.variable_scope("q2_head"):
@@ -74,7 +76,8 @@ class SACQHead(Head):
             for layer_size in self.network_layers_sizes[1:]:
                 qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
             # the output layer
-            self.q2_output = self.dense_layer(1)(qi_output, name='q2_output')
+            self.q2_output = self.dense_layer(1)(qi_output, name='q2_output',
+                                                 bias_initializer=self.output_bias_initializer)

         # take the minimum as the network's output. this is the log_target (in the original implementation)
         self.q_output = tf.minimum(self.q1_output, self.q2_output, name='q_output')

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
 class TD3VHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense, initializer='xavier'):
+                 dense_layer=Dense, initializer='xavier', output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'td3_v_values_head'
@@ -35,6 +35,7 @@ class TD3VHead(Head):
         self.initializer = initializer
         self.loss = []
         self.output = []
+        self.output_bias_initializer = output_bias_initializer

     def _build_module(self, input_layer):
         # Standard V Network
@@ -44,9 +45,11 @@ class TD3VHead(Head):
         for i in range(input_layer.shape[0]): # assuming that the actual size is 2, as there are two critic networks
             if self.initializer == 'normalized_columns':
                 q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
-                                                     kernel_initializer=normalized_columns_initializer(1.0)))
+                                                     kernel_initializer=normalized_columns_initializer(1.0),
+                                                     bias_initializer=self.output_bias_initializer),)
             elif self.initializer == 'xavier' or self.initializer is None:
-                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1)))
+                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
+                                                     bias_initializer=self.output_bias_initializer))

             self.output.append(q_outputs[i])
             self.loss.append(tf.reduce_mean((self.target-q_outputs[i])**2))

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
 class VHead(Head):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                  head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
-                 dense_layer=Dense, initializer='normalized_columns'):
+                 dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
         super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                          dense_layer=dense_layer)
         self.name = 'v_values_head'
@@ -38,14 +38,17 @@ class VHead(Head):
         self.loss_type = tf.losses.mean_squared_error

         self.initializer = initializer
+        self.output_bias_initializer = output_bias_initializer

     def _build_module(self, input_layer):
         # Standard V Network
         if self.initializer == 'normalized_columns':
             self.output = self.dense_layer(1)(input_layer, name='output',
                                               kernel_initializer=normalized_columns_initializer(1.0))
-                                              kernel_initializer=normalized_columns_initializer(1.0))
+                                              kernel_initializer=normalized_columns_initializer(1.0),
+                                              bias_initializer=self.output_bias_initializer)
         elif self.initializer == 'xavier' or self.initializer is None:
-            self.output = self.dense_layer(1)(input_layer, name='output')
+            self.output = self.dense_layer(1)(input_layer, name='output',
+                                              bias_initializer=self.output_bias_initializer)

     def __str__(self):
         result = [

@@ -168,15 +168,18 @@ class Dense(layers.Dense):
     def __init__(self, units: int):
         super(Dense, self).__init__(units=units)

-    def __call__(self, input_layer, name: str=None, kernel_initializer=None, activation=None, is_training=None):
+    def __call__(self, input_layer, name: str=None, kernel_initializer=None, bias_initializer=None,
+                 activation=None, is_training=None):
         """
         returns a tensorflow dense layer
         :param input_layer: previous layer
         :param name: layer name
         :return: dense layer
         """
+        if bias_initializer is None:
+            bias_initializer = tf.zeros_initializer()
         return tf.layers.dense(input_layer, self.units, name=name, kernel_initializer=kernel_initializer,
-                               activation=activation)
+                               activation=activation, bias_initializer=bias_initializer)

     @staticmethod
     @reg_to_tf_instance(layers.Dense)

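A hedged usage sketch of the extended wrapper (the import path and the 0.5 constant are assumptions for illustration; any TF initializer can be passed, and None keeps the old zeros default):

import tensorflow as tf
from rl_coach.architectures.tensorflow_components.layers import Dense  # assumed module path

state = tf.placeholder(tf.float32, shape=(None, 32), name='state')
# e.g. start the four Q-values of a head near an assumed mean return of 0.5
q_values = Dense(units=4)(state, name='output',
                          bias_initializer=tf.constant_initializer(0.5))
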
@@ -199,7 +202,8 @@ class NoisyNetDense(layers.NoisyNetDense):
     def __init__(self, units: int):
         super(NoisyNetDense, self).__init__(units=units)

-    def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None):
+    def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None,
+                 bias_initializer=None):
         """
         returns a NoisyNet dense layer
         :param input_layer: previous layer
@@ -233,10 +237,12 @@ class NoisyNetDense(layers.NoisyNetDense):
             kernel_stddev_initializer = tf.random_uniform_initializer(-stddev * self.sigma0, stddev * self.sigma0)
         else:
             kernel_mean_initializer = kernel_stddev_initializer = kernel_initializer
+        if bias_initializer is None:
+            bias_initializer = tf.zeros_initializer()
         with tf.variable_scope(None, default_name=name):
             weight_mean = tf.get_variable('weight_mean', shape=(num_inputs, num_outputs),
                                           initializer=kernel_mean_initializer)
-            bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=tf.zeros_initializer())
+            bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=bias_initializer)

             weight_stddev = tf.get_variable('weight_stddev', shape=(num_inputs, num_outputs),
                                             initializer=kernel_stddev_initializer)

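Note that for NoisyNetDense the new bias_initializer only seeds the learned bias_mean variable; the stddev variables shown in the hunk above keep their own initializers, and passing None again falls back to tf.zeros_initializer(), matching the previous behaviour.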