Batch RL Tutorial (#372)
@@ -50,43 +50,51 @@ class PPOHeadParameters(HeadParameters):
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns'):
loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns',
output_bias_initializer=None):
super().__init__(parameterized_class_name="VHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.initializer = initializer
self.output_bias_initializer = output_bias_initializer

class DDPGVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ddpg_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns'):
loss_weight: float = 1.0, dense_layer=None, initializer='normalized_columns',
output_bias_initializer=None):
super().__init__(parameterized_class_name="DDPGVHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.initializer = initializer
self.output_bias_initializer = output_bias_initializer

class CategoricalQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None):
loss_weight: float = 1.0, dense_layer=None,
output_bias_initializer=None):
super().__init__(parameterized_class_name="CategoricalQHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.output_bias_initializer = output_bias_initializer

class RegressionHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None, scheme=None):
loss_weight: float = 1.0, dense_layer=None, scheme=None,
output_bias_initializer=None):
super().__init__(parameterized_class_name="RegressionHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.output_bias_initializer = output_bias_initializer

class DDPGActorHeadParameters(HeadParameters):

@@ -153,21 +161,23 @@ class PolicyHeadParameters(HeadParameters):
class PPOVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None):
loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="PPOVHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.output_bias_initializer = output_bias_initializer

class QHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None):
loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="QHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.output_bias_initializer = output_bias_initializer

class ClassificationHeadParameters(HeadParameters):

@@ -183,11 +193,12 @@ class ClassificationHeadParameters(HeadParameters):
class QuantileRegressionQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None):
loss_weight: float = 1.0, dense_layer=None, output_bias_initializer=None):
super().__init__(parameterized_class_name="QuantileRegressionQHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.output_bias_initializer = output_bias_initializer

class RainbowQHeadParameters(HeadParameters):

@@ -218,18 +229,21 @@ class SACPolicyHeadParameters(HeadParameters):

class SACQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='sac_q_head_params', dense_layer=None,
layers_sizes: tuple = (256, 256)):
layers_sizes: tuple = (256, 256), output_bias_initializer=None):
super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name,
dense_layer=dense_layer)
self.network_layers_sizes = layers_sizes
self.output_bias_initializer = output_bias_initializer

class TD3VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='td3_v_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None, initializer='xavier'):
loss_weight: float = 1.0, dense_layer=None, initializer='xavier',
output_bias_initializer=None):
super().__init__(parameterized_class_name="TD3VHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
self.initializer = initializer
self.initializer = initializer
self.output_bias_initializer = output_bias_initializer

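The changes above add an output_bias_initializer argument to each of these head parameter classes and forward it to the head's output dense layer. A minimal usage sketch (assuming the TensorFlow 1.x backend these heads target; the constant -100 bias is the value used by the Acrobot Batch RL preset added later in this commit):

    import tensorflow as tf
    from rl_coach.architectures.head_parameters import QHeadParameters

    # initialize the bias of the Q-head's output layer to a constant value,
    # as done in the Acrobot_DDQN_BCQ_BatchRL preset further down
    head_params = QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))
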
@@ -26,9 +26,9 @@ from rl_coach.spaces import SpacesDefinition
class CategoricalQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu',
dense_layer=Dense):
dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
self.name = 'categorical_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms

@@ -37,7 +37,8 @@ class CategoricalQHead(QHead):
self.loss_type = []

def _build_module(self, input_layer):
values_distribution = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
values_distribution = self.dense_layer(self.num_actions * self.num_atoms)\
(input_layer, name='output', bias_initializer=self.output_bias_initializer)
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
self.num_atoms))
# softmax on atoms dimension

@@ -27,7 +27,7 @@ from rl_coach.utils import force_list
class RegressionHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense, scheme=[Dense(256), Dense(256)]):
dense_layer=Dense, scheme=[Dense(256), Dense(256)], output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'regression_head'

@@ -42,6 +42,7 @@ class RegressionHead(Head):
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
self.layers.append(input_layer)

@@ -50,7 +51,8 @@ class RegressionHead(Head):
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
))

self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output'))
self.layers.append(self.dense_layer(self.num_actions)(self.layers[-1], name='output',
bias_initializer=self.output_bias_initializer))
self.output = self.layers[-1]

def __str__(self):

@@ -24,9 +24,10 @@ from rl_coach.spaces import SpacesDefinition
class DDPGVHead(VHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense, initializer='normalized_columns'):
dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer, initializer=initializer)
dense_layer=dense_layer, initializer=initializer,
output_bias_initializer=output_bias_initializer)

def _build_module(self, input_layer):
super()._build_module(input_layer)

@@ -26,18 +26,20 @@ from rl_coach.spaces import SpacesDefinition
class PPOVHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense):
dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'ppo_v_head'
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.return_type = ActionProbabilities
self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
self.input = [self.old_policy_value]
self.output = self.dense_layer(1)(input_layer, name='output',
kernel_initializer=normalized_columns_initializer(1.0))
kernel_initializer=normalized_columns_initializer(1.0),
bias_initializer=self.output_bias_initializer)
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")

value_loss_1 = tf.square(self.output - self.target)

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpac
class QHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense):
dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'q_values_head'

@@ -46,9 +46,12 @@ class QHead(Head):
else:
self.loss_type = tf.losses.mean_squared_error

self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
# Standard Q Network
self.q_values = self.output = self.dense_layer(self.num_actions)(input_layer, name='output')
self.q_values = self.output = self.dense_layer(self.num_actions)\
(input_layer, name='output', bias_initializer=self.output_bias_initializer)

# used in batch-rl to estimate a probability distribution over actions
self.softmax = self.add_softmax_with_temperature()

@@ -25,9 +25,9 @@ from rl_coach.spaces import SpacesDefinition
class QuantileRegressionQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense):
dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
dense_layer=dense_layer, output_bias_initializer=output_bias_initializer)
self.name = 'quantile_regression_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably

@@ -43,7 +43,8 @@ class QuantileRegressionQHead(QHead):
self.input = [self.actions, self.quantile_midpoints]

# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)(input_layer, name='output')
quantiles_locations = self.dense_layer(self.num_actions * self.num_atoms)\
(input_layer, name='output', bias_initializer=self.output_bias_initializer)
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
self.output = quantiles_locations

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition, BoxActionSpace
class SACQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense):
dense_layer=Dense, output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'q_values_head'

@@ -41,6 +41,7 @@ class SACQHead(Head):
self.return_type = QActionStateValue
# extract the topology from the SACQHeadParameters
self.network_layers_sizes = agent_parameters.network_wrappers['q'].heads_parameters[0].network_layers_sizes
self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
# SAC Q network is basically 2 networks running in parallel on the same input (state , action)

@@ -63,7 +64,8 @@ class SACQHead(Head):
for layer_size in self.network_layers_sizes[1:]:
qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
# the output layer
self.q1_output = self.dense_layer(1)(qi_output, name='q1_output')
self.q1_output = self.dense_layer(1)(qi_output, name='q1_output',
bias_initializer=self.output_bias_initializer)

# build q2 network head
with tf.variable_scope("q2_head"):

@@ -74,7 +76,8 @@ class SACQHead(Head):
for layer_size in self.network_layers_sizes[1:]:
qi_output = self.dense_layer(layer_size)(qi_output, activation=self.activation_function)
# the output layer
self.q2_output = self.dense_layer(1)(qi_output, name='q2_output')
self.q2_output = self.dense_layer(1)(qi_output, name='q2_output',
bias_initializer=self.output_bias_initializer)

# take the minimum as the network's output. this is the log_target (in the original implementation)
self.q_output = tf.minimum(self.q1_output, self.q2_output, name='q_output')

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
class TD3VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense, initializer='xavier'):
dense_layer=Dense, initializer='xavier', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'td3_v_values_head'

@@ -35,6 +35,7 @@ class TD3VHead(Head):
self.initializer = initializer
self.loss = []
self.output = []
self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
# Standard V Network

@@ -44,9 +45,11 @@ class TD3VHead(Head):
for i in range(input_layer.shape[0]): # assuming that the actual size is 2, as there are two critic networks
if self.initializer == 'normalized_columns':
q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
kernel_initializer=normalized_columns_initializer(1.0)))
kernel_initializer=normalized_columns_initializer(1.0),
bias_initializer=self.output_bias_initializer),)
elif self.initializer == 'xavier' or self.initializer is None:
q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1)))
q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
bias_initializer=self.output_bias_initializer))

self.output.append(q_outputs[i])
self.loss.append(tf.reduce_mean((self.target-q_outputs[i])**2))

@@ -26,7 +26,7 @@ from rl_coach.spaces import SpacesDefinition
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense, initializer='normalized_columns'):
dense_layer=Dense, initializer='normalized_columns', output_bias_initializer=None):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'v_values_head'

@@ -38,14 +38,17 @@ class VHead(Head):
self.loss_type = tf.losses.mean_squared_error

self.initializer = initializer
self.output_bias_initializer = output_bias_initializer

def _build_module(self, input_layer):
# Standard V Network
if self.initializer == 'normalized_columns':
self.output = self.dense_layer(1)(input_layer, name='output',
kernel_initializer=normalized_columns_initializer(1.0))
kernel_initializer=normalized_columns_initializer(1.0),
bias_initializer=self.output_bias_initializer)
elif self.initializer == 'xavier' or self.initializer is None:
self.output = self.dense_layer(1)(input_layer, name='output')
self.output = self.dense_layer(1)(input_layer, name='output',
bias_initializer=self.output_bias_initializer)

def __str__(self):
result = [

@@ -168,15 +168,18 @@ class Dense(layers.Dense):
def __init__(self, units: int):
super(Dense, self).__init__(units=units)

def __call__(self, input_layer, name: str=None, kernel_initializer=None, activation=None, is_training=None):
def __call__(self, input_layer, name: str=None, kernel_initializer=None, bias_initializer=None,
activation=None, is_training=None):
"""
returns a tensorflow dense layer
:param input_layer: previous layer
:param name: layer name
:return: dense layer
"""
if bias_initializer is None:
bias_initializer = tf.zeros_initializer()
return tf.layers.dense(input_layer, self.units, name=name, kernel_initializer=kernel_initializer,
activation=activation)
activation=activation, bias_initializer=bias_initializer)

@staticmethod
@reg_to_tf_instance(layers.Dense)

@@ -199,7 +202,8 @@ class NoisyNetDense(layers.NoisyNetDense):
def __init__(self, units: int):
super(NoisyNetDense, self).__init__(units=units)

def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None):
def __call__(self, input_layer, name: str, kernel_initializer=None, activation=None, is_training=None,
bias_initializer=None):
"""
returns a NoisyNet dense layer
:param input_layer: previous layer

@@ -233,10 +237,12 @@ class NoisyNetDense(layers.NoisyNetDense):
kernel_stddev_initializer = tf.random_uniform_initializer(-stddev * self.sigma0, stddev * self.sigma0)
else:
kernel_mean_initializer = kernel_stddev_initializer = kernel_initializer
if bias_initializer is None:
bias_initializer = tf.zeros_initializer()
with tf.variable_scope(None, default_name=name):
weight_mean = tf.get_variable('weight_mean', shape=(num_inputs, num_outputs),
initializer=kernel_mean_initializer)
bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=tf.zeros_initializer())
bias_mean = tf.get_variable('bias_mean', shape=(num_outputs,), initializer=bias_initializer)

weight_stddev = tf.get_variable('weight_stddev', shape=(num_inputs, num_outputs),
initializer=kernel_stddev_initializer)

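As a point of reference, here is a minimal sketch of calling the updated Dense wrapper directly (the placeholder input is hypothetical, and the import path is assumed to be the TensorFlow backend's layers module). When bias_initializer is left as None the wrapper falls back to tf.zeros_initializer(), preserving the previous behaviour:

    import tensorflow as tf
    from rl_coach.architectures.tensorflow_components.layers import Dense

    input_layer = tf.placeholder(tf.float32, shape=[None, 32], name='features')  # hypothetical input
    # the bias initializer is forwarded to tf.layers.dense together with the kernel initializer
    output = Dense(1)(input_layer, name='output', bias_initializer=tf.constant_initializer(-100))
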
@@ -64,15 +64,14 @@ class RewardNormalizationFilter(RewardFilter):

def filter(self, reward: RewardType, update_internal_state: bool=True) -> RewardType:
if update_internal_state:
if not isinstance(reward, np.ndarray) or len(reward.shape) < 2:
reward = np.array([[reward]])
self.running_rewards_stats.push(reward)

reward = (reward - self.running_rewards_stats.mean) / \
(self.running_rewards_stats.std + 1e-15)
reward = np.clip(reward, self.clip_min, self.clip_max)

return reward
return self.running_rewards_stats.normalize(reward).squeeze()

def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
self.running_rewards_stats.set_params(shape=(1,), clip_values=(self.clip_min, self.clip_max))
return input_reward_space

def save_state_to_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str):

@@ -37,7 +37,6 @@ from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
from rl_coach.core_types import TimeTypes

# TODO build a tutorial for batch RL
class BatchRLGraphManager(BasicRLGraphManager):
"""
A batch RL graph manager creates a scenario of learning from a dataset without a simulator.

@@ -95,6 +94,8 @@ class BatchRLGraphManager(BasicRLGraphManager):
self.schedule_params = schedule_params

def _create_graph(self, task_parameters: TaskParameters) -> Tuple[List[LevelManager], List[Environment]]:
assert self.agent_params.memory.load_memory_from_file_path or self.env_params, \
"BatchRL requires either a dataset to train from or an environment to collect a dataset from. "
if self.env_params:
# environment loading
self.env_params.seed = task_parameters.seed

@@ -172,36 +173,38 @@ class BatchRLGraphManager(BasicRLGraphManager):
# initialize the network parameters from the global network
self.sync()

# TODO a bug in heatup where the last episode run is not fed into the ER. e.g. asked for 1024 heatup steps,
# the last episode that ran increased the total to 1040 steps, but the ER will contain only 1014 steps.
# The last episode is not there. Is this a bug in my changes or also on master?
# If we have both an environment and a dataset to load from, we will use the environment only for
# evaluating the policy, and will not run heatup. If no dataset is available to load from, we will be collecting
# a dataset from an environment.
if not self.agent_params.memory.load_memory_from_file_path:
if self.is_collecting_random_dataset:
# heatup
if self.env_params is not None:
screen.log_title(
"Collecting random-action experience to use for training the actual agent in a Batch RL "
"fashion")
# Creating a random dataset during the heatup phase is useful mainly for tutorial and debug
# purposes.
self.heatup(self.heatup_steps)
else:
screen.log_title(
"Starting to improve an agent collecting experience to use for training the actual agent in a "
"Batch RL fashion")

# Creating a dataset during the heatup phase is useful mainly for tutorial and debug purposes. If we have both
# an environment and a dataset to load from, we will use the environment only for evaluating the policy,
# and will not run heatup.
# set the experience generating agent to train
self.level_managers[0].agents = {'experience_generating_agent': self.experience_generating_agent}

screen.log_title("Starting to improve an agent collecting experience to use for training the actual agent in a "
"Batch RL fashion")
# collect a dataset using the experience generating agent
super().improve()

if self.is_collecting_random_dataset:
# heatup
if self.env_params is not None and not self.agent_params.memory.load_memory_from_file_path:
self.heatup(self.heatup_steps)
else:
# set the experience generating agent to train
self.level_managers[0].agents = {'experience_generating_agent': self.experience_generating_agent}
# set the acquired experience to the actual agent that we're going to train
self.agent.memory = self.experience_generating_agent.memory

# collect a dataset using the experience generating agent
super().improve()
# switch the graph scheduling parameters
self.set_schedule_params(self.schedule_params)

# set the acquired experience to the actual agent that we're going to train
self.agent.memory = self.experience_generating_agent.memory

# switch the graph scheduling parameters
self.set_schedule_params(self.schedule_params)

# set the actual agent to train
self.level_managers[0].agents = {'agent': self.agent}
# set the actual agent to train
self.level_managers[0].agents = {'agent': self.agent}

# this agent never actually plays
self.level_managers[0].agents['agent'].ap.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)

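To make the two modes supported by the reorganized improve() flow concrete, here is a minimal configuration sketch (the imports mirror the Acrobot preset added at the end of this commit; the CSV path is hypothetical):

    from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters
    from rl_coach.core_types import CsvDataset
    from rl_coach.memories.episodic import EpisodicExperienceReplayParameters

    agent_params = DDQNBCQAgentParameters()
    agent_params.memory = EpisodicExperienceReplayParameters()

    # mode 1: train purely offline from a pre-recorded dataset (hypothetical path);
    # the environment, if given, is then only used to evaluate the learned policy
    agent_params.memory.load_memory_from_file_path = CsvDataset('acrobot.csv', True)

    # mode 2: leave load_memory_from_file_path unset and pass env_params to
    # BatchRLGraphManager; it will first collect a dataset (random heatup, or an
    # experience-generating agent) and only then train the actual agent offline
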
@@ -15,6 +15,8 @@
# limitations under the License.
#
import ast

import pickle
from copy import deepcopy

import math

@@ -141,14 +143,27 @@ class EpisodicExperienceReplay(Memory):

def shuffle_episodes(self):
"""
Shuffle all the episodes in the replay buffer
Shuffle all the complete episodes in the replay buffer, while deleting the last non-complete episode
:return:
"""
self.reader_writer_lock.lock_writing()

self.assert_not_frozen()

# unlike the standard usage of the EpisodicExperienceReplay, where we always leave an empty episode after
# the last full one, so that new transitions have somewhere to be added, in this case we deliberately remove
# that empty last episode, as we are about to shuffle the memory, and we don't want it to be shuffled in
self.remove_last_episode(lock=False)

random.shuffle(self._buffer)
self.transitions = [t for e in self._buffer for t in e.transitions]

# create a new Episode for the next transitions to be placed into
self._buffer.append(Episode(n_step=self.n_step))
self._length += 1

self.reader_writer_lock.release_writing()

def get_shuffled_training_data_generator(self, size: int) -> List[Transition]:
"""
Get a generator for iterating through the shuffled replay buffer, for processing the data in epochs.

@@ -201,10 +216,10 @@ class EpisodicExperienceReplay(Memory):
granularity, size = self.max_size
if granularity == MemoryGranularity.Transitions:
while size != 0 and self.num_transitions() > size:
self._remove_episode(0)
self.remove_first_episode(lock=False)
elif granularity == MemoryGranularity.Episodes:
while self.length() > size:
self._remove_episode(0)
self.remove_first_episode(lock=False)

def _update_episode(self, episode: Episode) -> None:
episode.update_transitions_rewards_and_bootstrap_data()

@@ -321,31 +336,53 @@ class EpisodicExperienceReplay(Memory):

def _remove_episode(self, episode_index: int) -> None:
"""
Remove the episode in the given index (even if it is not complete yet)
:param episode_index: the index of the episode to remove
Remove either the first or the last episode
:param episode_index: the index of the episode to remove (either 0 or -1)
:return: None
"""
self.assert_not_frozen()
assert episode_index == 0 or episode_index == -1, "_remove_episode only supports removing the first or the last " \
"episode"

if len(self._buffer) > episode_index:
if len(self._buffer) > 0:
episode_length = self._buffer[episode_index].length()
self._length -= 1
self._num_transitions -= episode_length
self._num_transitions_in_complete_episodes -= episode_length
del self.transitions[:episode_length]
if episode_index == 0:
del self.transitions[:episode_length]
else: # episode_index = -1
del self.transitions[-episode_length:]
del self._buffer[episode_index]

def remove_episode(self, episode_index: int) -> None:
def remove_first_episode(self, lock: bool = True) -> None:
"""
Remove the episode in the given index (even if it is not complete yet)
:param episode_index: the index of the episode to remove
Remove the first episode (even if it is not complete yet)
:param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
locks and then calls store with lock = True
:return: None
"""
self.reader_writer_lock.lock_writing_and_reading()
if lock:
self.reader_writer_lock.lock_writing_and_reading()

self._remove_episode(episode_index)
self._remove_episode(0)
if lock:
self.reader_writer_lock.release_writing_and_reading()

self.reader_writer_lock.release_writing_and_reading()
def remove_last_episode(self, lock: bool = True) -> None:
"""
Remove the last episode (even if it is not complete yet)
:param lock: if true, will lock the readers writers lock. this can cause a deadlock if an inheriting class
locks and then calls store with lock = True
:return: None
"""
if lock:
self.reader_writer_lock.lock_writing_and_reading()

self._remove_episode(-1)

if lock:
self.reader_writer_lock.release_writing_and_reading()

# for API compatibility
def get(self, episode_index: int, lock: bool = True) -> Union[None, Episode]:

@@ -372,15 +409,6 @@ class EpisodicExperienceReplay(Memory):

return episode

# for API compatibility
def remove(self, episode_index: int):
"""
Remove the episode in the given index (even if it is not complete yet)
:param episode_index: the index of the episode to remove
:return: None
"""
self.remove_episode(episode_index)

def clean(self) -> None:
"""
Clean the memory by removing all the episodes

@@ -446,7 +474,7 @@ class EpisodicExperienceReplay(Memory):

transitions.append(
Transition(state={'observation': state},
action=current_transition['action'], reward=current_transition['reward'],
action=int(current_transition['action']), reward=current_transition['reward'],
next_state={'observation': next_state}, game_over=False,
info={'all_action_probabilities':
ast.literal_eval(current_transition['all_action_probabilities'])}),

@@ -516,3 +544,36 @@ class EpisodicExperienceReplay(Memory):
self.last_training_set_episode_id = episode_num
self.last_training_set_transition_id = \
len([t for e in self.get_all_complete_episodes_from_to(0, self.last_training_set_episode_id + 1) for t in e])

def save(self, file_path: str) -> None:
"""
Save the replay buffer contents to a pickle file
:param file_path: the path to the file that will be used to store the pickled episodes
"""
with open(file_path, 'wb') as file:
pickle.dump(self.get_all_complete_episodes(), file)

def load_pickled(self, file_path: str) -> None:
"""
Restore the replay buffer contents from a pickle file.
The pickle file is assumed to include a list of episodes.
:param file_path: The path to a pickle file to restore
"""
self.assert_not_frozen()

with open(file_path, 'rb') as file:
episodes = pickle.load(file)
num_transitions = sum([len(e.transitions) for e in episodes])
if num_transitions > self.max_size[1]:
screen.warning("Warning! The number of transitions to load into the replay buffer ({}) is "
"bigger than the max size of the replay buffer ({}). The excessive transitions will "
"not be stored.".format(num_transitions, self.max_size[1]))

progress_bar = ProgressBar(len(episodes))
for episode_idx, episode in enumerate(episodes):
self.store_episode(episode)

# print progress
progress_bar.update(episode_idx)

progress_bar.close()

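A small usage sketch for the new save()/load_pickled() pair (the file path is hypothetical, and it assumes EpisodicExperienceReplay is importable from rl_coach.memories.episodic and can be constructed with its default arguments):

    from rl_coach.memories.episodic import EpisodicExperienceReplay

    # persist all complete episodes currently held in the buffer
    memory = EpisodicExperienceReplay()
    # ... fill the buffer by running an agent or storing episodes ...
    memory.save('/tmp/acrobot_episodes.p')

    # later, restore the episodes into a fresh buffer for offline (batch RL) training
    restored = EpisodicExperienceReplay()
    restored.load_pickled('/tmp/acrobot_episodes.p')
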
@@ -58,9 +58,6 @@ class Memory(object):
def get(self, index):
raise NotImplementedError("")

def remove(self, index):
raise NotImplementedError("")

def length(self):
raise NotImplementedError("")

@@ -198,15 +198,6 @@ class ExperienceReplay(Memory):
"""
return self.get_transition(transition_index, lock)

# for API compatibility
def remove(self, transition_index: int, lock: bool=True):
"""
Remove the transition in the given index
:param transition_index: the index of the transition to remove
:return: None
"""
self.remove_transition(transition_index, lock)

def clean(self, lock: bool=True) -> None:
"""
Clean the memory by removing all the episodes

rl_coach/presets/Acrobot_DDQN_BCQ_BatchRL.py (new file, 116 lines)
@@ -0,0 +1,116 @@
import tensorflow as tf

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, CsvDataset
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.batch_rl_graph_manager import BatchRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule
from rl_coach.memories.episodic import EpisodicExperienceReplayParameters
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters

from rl_coach.agents.ddqn_bcq_agent import KNNParameters

DATASET_SIZE = 50000

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########

agent_params = DDQNBCQAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
# TODO cross-DL framework abstraction for a constant initializer?
agent_params.network_wrappers['main'].heads_parameters = [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(
100)
agent_params.algorithm.discount = 0.99

agent_params.algorithm.action_drop_method_parameters = KNNParameters()

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
# DATATSET_PATH = 'acrobot.csv'
# agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH, True)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# Experience Generating Agent parameters
experience_generating_agent_params = DDQNAgentParameters()

# schedule parameters
experience_generating_schedule_params = ScheduleParameters()
experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
experience_generating_schedule_params.improve_steps = TrainingSteps(
DATASET_SIZE - experience_generating_schedule_params.heatup_steps.num_steps)
experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

# DQN params
experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
experience_generating_agent_params.algorithm.discount = 0.99
experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

# NN configuration
experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001
experience_generating_agent_params.network_wrappers['main'].batch_size = 128
experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
[QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

# ER size
experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
experience_generating_agent_params.memory.max_size = \
(MemoryGranularity.Transitions,
experience_generating_schedule_params.heatup_steps.num_steps +
experience_generating_schedule_params.improve_steps.num_steps + 1)

# E-Greedy schedule
experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, DATASET_SIZE)
experience_generating_agent_params.exploration.evaluation_epsilon = 0

################
# Environment #
################
env_params = GymVectorEnvironment(level='Acrobot-v1')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 50
preset_validation_params.read_csv_tries = 500

graph_manager = BatchRLGraphManager(agent_params=agent_params,
experience_generating_agent_params=experience_generating_agent_params,
experience_generating_schedule_params=experience_generating_schedule_params,
env_params=env_params,
schedule_params=schedule_params,
vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
preset_validation_params=preset_validation_params,
reward_model_num_epochs=30,
train_to_eval_ratio=0.4)
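Assuming a standard Coach installation, the new preset can be launched from the command line (coach -p Acrobot_DDQN_BCQ_BatchRL) or programmatically, roughly along these lines (a sketch; TaskParameters defaults are assumed to be sufficient):

    from rl_coach.base_parameters import TaskParameters

    # build the graph defined by this preset and run the batch RL improve loop
    graph_manager.create_graph(TaskParameters())
    graph_manager.improve()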