TD3 (#338)
@@ -41,14 +41,15 @@ class DDPGCriticNetworkParameters(NetworkParameters):
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [DDPGVHeadParameters()]
        self.optimizer_type = 'Adam'
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.batch_size = 64
        self.async_training = False
        self.learning_rate = 0.001
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
        # self.l2_regularization = 1e-2


class DDPGActorNetworkParameters(NetworkParameters):

@@ -58,9 +59,9 @@ class DDPGActorNetworkParameters(NetworkParameters):
        self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
        self.heads_parameters = [DDPGActorHeadParameters()]
        self.optimizer_type = 'Adam'
        self.batch_size = 64
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.batch_size = 64
        self.async_training = False
        self.learning_rate = 0.0001
        self.create_target_network = True

@@ -217,4 +218,4 @@ class DDPGAgent(ActorCriticAgent):
        action_info = ActionInfo(action=action,
                                 action_value=q_value)

        return action_info
        return action_info

@@ -90,7 +90,7 @@ class DDQNBCQAgent(DQNAgent):
            if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
                return self.networks['reward_model'].online_network.predict(
                    states,
                    outputs=[self.networks['reward_model'].online_network.state_embedding])
                    outputs=[self.networks['reward_model'].online_network.state_embedding[0]])
            else:
                return states['observation']
        self.embedding = to_embedding

@@ -189,7 +189,7 @@ class DDQNBCQAgent(DQNAgent):
        if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
            self.knn_trees = [AnnoyDictionary(
                dict_size=knn_size,
                key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
                key_width=int(self.networks['reward_model'].online_network.state_embedding[0].shape[-1]),
                batch_size=knn_size)
                for _ in range(len(self.spaces.action.actions))]
        else:

@@ -194,7 +194,7 @@ class NECAgent(ValueOptimizationAgent):
        )
        if self.phase != RunPhase.TEST:
            # store the state embedding for inserting it to the DND later
            self.current_episode_state_embeddings.append(embedding.squeeze())
            self.current_episode_state_embeddings.append(embedding[0].squeeze())
        actions_q_values = actions_q_values[0][0]
        return actions_q_values

rl_coach/agents/td3_agent.py (new file, 223 lines)
@@ -0,0 +1,223 @@
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
from typing import Union
from collections import OrderedDict

import numpy as np

from rl_coach.agents.agent import Agent
from rl_coach.agents.ddpg_agent import DDPGAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, TD3VHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
    AgentParameters, EmbedderScheme
from rl_coach.core_types import ActionInfo, TrainingSteps, Transition
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace, GoalsSpace


class TD3CriticNetworkParameters(NetworkParameters):
    def __init__(self, num_q_networks):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(),
                                           'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
        self.middleware_parameters = FCMiddlewareParameters(num_streams=num_q_networks)
        self.heads_parameters = [TD3VHeadParameters()]
        self.optimizer_type = 'Adam'
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.batch_size = 100
        self.async_training = False
        self.learning_rate = 0.001
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False


class TD3ActorNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [DDPGActorHeadParameters(batchnorm=False)]
        self.optimizer_type = 'Adam'
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.batch_size = 100
        self.async_training = False
        self.learning_rate = 0.001
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False

class TD3AlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_copying_online_weights_to_target: (StepMethod)
        The number of steps between copying the online network weights to the target network weights.

    :param rate_for_copying_weights_to_target: (float)
        When copying the online network weights to the target network weights, a soft update will be used, which
        weights the new online network weights by rate_for_copying_weights_to_target

    :param num_consecutive_playing_steps: (StepMethod)
        The number of consecutive steps to act between every two training iterations

    :param use_target_network_for_evaluation: (bool)
        If set to True, the target network will be used for predicting the actions when choosing actions to act.
        Since the target network weights change more slowly, the predicted actions will be more consistent.

    :param action_penalty: (float)
        The amount by which to penalize the network on high action feature (pre-activation) values.
        This can prevent the action features from saturating the TanH activation function, and therefore prevent the
        gradients from becoming very low.

    :param clip_critic_targets: (Tuple[float, float] or None)
        The range to clip the critic target to in order to prevent overestimation of the action values.

    :param use_non_zero_discount_for_terminal_states: (bool)
        If set to True, the discount factor will be used for terminal states to bootstrap the next predicted state
        values. If set to False, the terminal state's reward will be taken as the target return for the network.
    """
    def __init__(self):
        super().__init__()
        self.rate_for_copying_weights_to_target = 0.005
        self.use_target_network_for_evaluation = False
        self.action_penalty = 0
        self.clip_critic_targets = None  # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
        self.use_non_zero_discount_for_terminal_states = False
        self.act_for_full_episodes = True
        self.update_policy_every_x_episode_steps = 2
        self.num_steps_between_copying_online_weights_to_target = TrainingSteps(self.update_policy_every_x_episode_steps)
        self.policy_noise = 0.2
        self.noise_clipping = 0.5
        self.num_q_networks = 2


class TD3AgentExplorationParameters(AdditiveNoiseParameters):
    def __init__(self):
        super().__init__()
        self.noise_as_percentage_from_action_space = False


class TD3AgentParameters(AgentParameters):
    def __init__(self):
        td3_algorithm_params = TD3AlgorithmParameters()
        super().__init__(algorithm=td3_algorithm_params,
                         exploration=TD3AgentExplorationParameters(),
                         memory=EpisodicExperienceReplayParameters(),
                         networks=OrderedDict([("actor", TD3ActorNetworkParameters()),
                                               ("critic",
                                                TD3CriticNetworkParameters(td3_algorithm_params.num_q_networks))]))

    @property
    def path(self):
        return 'rl_coach.agents.td3_agent:TD3Agent'

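A preset can take the defaults above and override individual fields after construction. A minimal, illustrative sketch (not part of the committed file; the values are examples, not tuned recommendations):

from rl_coach.agents.td3_agent import TD3AgentParameters

agent_params = TD3AgentParameters()

# target policy smoothing: std of the Gaussian noise added to the target action, and its clipping range
agent_params.algorithm.policy_noise = 0.1
agent_params.algorithm.noise_clipping = 0.25

# soft target updates (tau)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.01

# per-network settings live on the network wrappers
agent_params.network_wrappers['actor'].learning_rate = 3e-4
agent_params.network_wrappers['critic'].learning_rate = 3e-4
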
# Twin Delayed DDPG - https://arxiv.org/pdf/1802.09477.pdf
class TD3Agent(DDPGAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

        self.q_values = self.register_signal("Q")
        self.TD_targets_signal = self.register_signal("TD targets")
        self.action_signal = self.register_signal("actions")

    def learn_from_batch(self, batch):
        actor = self.networks['actor']
        critic = self.networks['critic']

        actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
        critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

        # TD error = r + discount*max(q_st_plus_1) - q_st
        next_actions, actions_mean = actor.parallel_prediction([
            (actor.target_network, batch.next_states(actor_keys)),
            (actor.online_network, batch.states(actor_keys))
        ])

        # add noise to the next_actions
        noise = np.random.normal(0, self.ap.algorithm.policy_noise, next_actions.shape).clip(
            -self.ap.algorithm.noise_clipping, self.ap.algorithm.noise_clipping)
        next_actions = self.spaces.action.clip_action_to_space(next_actions + noise)

        critic_inputs = copy.copy(batch.next_states(critic_keys))
        critic_inputs['action'] = next_actions
        q_st_plus_1 = critic.target_network.predict(critic_inputs)[2]  # output #2 is the min (Q1, Q2)

        # calculate the bootstrapped TD targets while discounting terminal states according to
        # use_non_zero_discount_for_terminal_states
        if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
            TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
        else:
            TD_targets = batch.rewards(expand_dims=True) + \
                         (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1

        # clip the TD targets to prevent overestimation errors
        if self.ap.algorithm.clip_critic_targets:
            TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)

        self.TD_targets_signal.add_sample(TD_targets)

        # train the critic
        critic_inputs = copy.copy(batch.states(critic_keys))
        critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
        result = critic.train_and_sync_networks(critic_inputs, TD_targets)
        total_loss, losses, unclipped_grads = result[:3]

        if self.training_iteration % self.ap.algorithm.update_policy_every_x_episode_steps == 0:
            # get the gradients of output #3 (=mean of Q1 network) w.r.t the action
            critic_inputs = copy.copy(batch.states(critic_keys))
            critic_inputs['action'] = actions_mean
            action_gradients = critic.online_network.predict(critic_inputs,
                                                             outputs=critic.online_network.gradients_wrt_inputs[3]['action'])

            # apply the gradients from the critic to the actor
            initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
            gradients = actor.online_network.predict(batch.states(actor_keys),
                                                     outputs=actor.online_network.weighted_gradients[0],
                                                     initial_feed_dict=initial_feed_dict)

            if actor.has_global:
                actor.apply_gradients_to_global_network(gradients)
                actor.update_online_network()
            else:
                actor.apply_gradients_to_online_network(gradients)

        return total_loss, losses, unclipped_grads

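The target construction above follows the TD3 recipe: smooth the target policy's action with clipped Gaussian noise, evaluate both target critics, and bootstrap from their minimum. A self-contained numpy sketch of the same computation (not part of the committed file; toy shapes and values for illustration only):

import numpy as np

def smoothed_target_action(target_action, low, high, policy_noise=0.2, noise_clipping=0.5):
    # clipped Gaussian noise on the target policy's action, kept inside the action space
    noise = np.clip(np.random.normal(0.0, policy_noise, target_action.shape),
                    -noise_clipping, noise_clipping)
    return np.clip(target_action + noise, low, high)

def td3_targets(rewards, game_overs, q1_next, q2_next, discount=0.99):
    # clipped double-Q: bootstrap from the minimum of the two target critics
    q_min = np.minimum(q1_next, q2_next)
    return rewards + (1.0 - game_overs) * discount * q_min

# toy batch of two transitions, the second one terminal
rewards = np.array([[1.0], [0.0]])
game_overs = np.array([[0.0], [1.0]])
q1_next = np.array([[10.0], [5.0]])
q2_next = np.array([[9.0], [7.0]])
print(td3_targets(rewards, game_overs, q1_next, q2_next))  # [[9.91], [0.]]
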
    def train(self):
        self.ap.algorithm.num_consecutive_training_steps = self.current_episode_steps_counter
        return Agent.train(self)

    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.

        :param transition: the transition to update
        :return: the updated transition
        """
        transition.game_over = False if self.current_episode_steps_counter == \
            self.parent_level_manager.environment.env._max_episode_steps \
            else transition.game_over

        return transition

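update_transition_before_adding_to_replay_buffer treats episodes cut by the environment's step cap as non-terminal, so the critic keeps bootstrapping through time-limit truncations. A small worked example of the difference (illustrative numbers, not from the commit):

reward, discount, q_next = 1.0, 0.99, 50.0

target_if_terminal = reward                         # 1.0  - no bootstrapping past a true terminal
target_if_time_limit = reward + discount * q_next   # 50.5 - a time-limit cut still bootstraps
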
@@ -221,3 +221,14 @@ class SACQHeadParameters(HeadParameters):
        super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name,
                         dense_layer=dense_layer)
        self.network_layers_sizes = layers_sizes


class TD3VHeadParameters(HeadParameters):
    def __init__(self, activation_function: str ='relu', name: str='td3_v_head_params',
                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
                 loss_weight: float = 1.0, dense_layer=None, initializer='xavier'):
        super().__init__(parameterized_class_name="TD3VHead", activation_function=activation_function, name=name,
                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                         loss_weight=loss_weight)
        self.initializer = initializer

@@ -41,10 +41,11 @@ class FCMiddlewareParameters(MiddlewareParameters):
    def __init__(self, activation_function='relu',
                 scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout_rate: float = 0.0,
                 name="middleware_fc_embedder", dense_layer=None, is_training=False):
                 name="middleware_fc_embedder", dense_layer=None, is_training=False, num_streams=1):
        super().__init__(parameterized_class_name="FCMiddleware", activation_function=activation_function,
                         scheme=scheme, batchnorm=batchnorm, dropout_rate=dropout_rate, name=name, dense_layer=dense_layer,
                         is_training=is_training)
        self.num_streams = num_streams


class LSTMMiddlewareParameters(MiddlewareParameters):

@@ -203,7 +203,6 @@ class TensorFlowArchitecture(Architecture):
        self._create_gradient_accumulators()

        # gradients of the outputs w.r.t. the inputs
        # at the moment, this is only used by ddpg
        self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
                                      self.inputs.items()} for output in self.outputs]
        self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')

@@ -16,6 +16,7 @@ from .sac_head import SACPolicyHead
from .sac_q_head import SACQHead
from .classification_head import ClassificationHead
from .cil_head import RegressionHead
from .td3_v_head import TD3VHead
from .ddpg_v_head import DDPGVHead

__all__ = [

@@ -37,5 +38,6 @@ __all__ = [
    'SACQHead',
    'ClassificationHead',
    'RegressionHead',
    'TD3VHead'
    'DDPGVHead'
]

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type

import numpy as np
import tensorflow as tf

@@ -22,7 +21,7 @@ from rl_coach.architectures.tensorflow_components.layers import Dense, convert_l
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import force_list

from rl_coach.architectures.tensorflow_components.utils import squeeze_tensor

# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):

@@ -72,8 +71,9 @@ class Head(object):
        :param input_layer: the input to the graph
        :return: the output of the last layer and the target placeholder
        """

        with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
            self._build_module(input_layer)
            self._build_module(squeeze_tensor(input_layer))

        self.output = force_list(self.output)
        self.target = force_list(self.target)

@@ -0,0 +1,67 @@
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf

from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import VStateValue
from rl_coach.spaces import SpacesDefinition


class TD3VHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense, initializer='xavier'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'td3_v_values_head'
        self.return_type = VStateValue
        self.loss_type = []
        self.initializer = initializer
        self.loss = []
        self.output = []

    def _build_module(self, input_layer):
        # Standard V Network
        q_outputs = []
        self.target = tf.placeholder(tf.float32, shape=(None, 1), name="q_networks_min_placeholder")

        for i in range(input_layer.shape[0]):  # assuming that the actual size is 2, as there are two critic networks
            if self.initializer == 'normalized_columns':
                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
                                                     kernel_initializer=normalized_columns_initializer(1.0)))
            elif self.initializer == 'xavier' or self.initializer is None:
                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1)))

            self.output.append(q_outputs[i])
            self.loss.append(tf.reduce_mean((self.target-q_outputs[i])**2))

        self.output.append(tf.reduce_min(q_outputs, axis=0))
        self.output.append(tf.reduce_mean(self.output[0]))
        self.loss = sum(self.loss)
        tf.losses.add_loss(self.loss)

    def __str__(self):
        result = [
            "Q1 Action-Value Stream",
            "\tDense (num outputs = 1)",
            "Q2 Action-Value Stream",
            "\tDense (num outputs = 1)",
            "Min (Q1, Q2)"
        ]
        return '\n'.join(result)

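The head therefore exposes its outputs in a fixed order, which is what the agent code indexes: entries 0..N-1 are the per-critic Q values, entry N is their element-wise minimum (fetched for the TD targets via predict(...)[2]), and entry N+1 is the mean of the first critic's Q values (the objective behind gradients_wrt_inputs[3]['action']). A hedged numpy illustration of that layout for the default two-critic case (not library code):

import numpy as np

q1 = np.array([[2.0], [3.0]])   # output 0: Q1(s, a)
q2 = np.array([[1.5], [3.5]])   # output 1: Q2(s, a)
q_min = np.minimum(q1, q2)      # output 2: min(Q1, Q2), the bootstrap value for the critic targets
q1_mean = q1.mean()             # output 3: mean of Q1, the objective the actor ascends
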
@@ -28,23 +28,28 @@ class FCMiddleware(Middleware):
    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout_rate: float = 0.0,
                 name="middleware_fc_embedder", dense_layer=Dense, is_training=False):
                 name="middleware_fc_embedder", dense_layer=Dense, is_training=False, num_streams: int = 1):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout_rate=dropout_rate, scheme=scheme, name=name, dense_layer=dense_layer,
                         is_training=is_training)
        self.return_type = Middleware_FC_Embedding
        self.layers = []

        assert(isinstance(num_streams, int) and num_streams >= 1)
        self.num_streams = num_streams

    def _build_module(self):
        self.layers.append(self.input)
        self.output = []

        for idx, layer_params in enumerate(self.layers_params):
            self.layers.extend(force_list(
                layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx),
                             is_training=self.is_training)
            ))
        for stream_idx in range(self.num_streams):
            layers = [self.input]

        self.output = self.layers[-1]
            for idx, layer_params in enumerate(self.layers_params):
                layers.extend(force_list(
                    layer_params(layers[-1], name='{}_{}'.format(layer_params.__class__.__name__,
                                                                 idx + stream_idx * len(self.layers_params)),
                                 is_training=self.is_training)
                ))
            self.output.append((layers[-1]))

    @property
    def schemes(self):

@@ -72,3 +77,15 @@ class FCMiddleware(Middleware):
            ]
        }

    def __str__(self):
        stream = [str(l) for l in self.layers_params]
        if self.layers_params:
            if self.num_streams > 1:
                stream = [''] + ['\t' + l for l in stream]
                result = stream * self.num_streams
                result[0::len(stream)] = ['Stream {}'.format(i) for i in range(self.num_streams)]
            else:
                result = stream
            return '\n'.join(result)
        else:
            return 'No layers'

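In plain terms, _build_module now replicates the same layer scheme num_streams times over the shared input and returns one output per stream, which is how the TD3 critic gets two independent Q towers behind a single input embedding. A framework-free sketch of that control flow (an assumed helper, not library code):

def build_streams(shared_input, layer_builders, num_streams):
    outputs = []
    for stream_idx in range(num_streams):
        x = shared_input
        for build in layer_builders:   # e.g. the [Dense(400), Dense(300)] scheme used in the TD3 preset
            x = build(x)               # each stream applies its own copy of the scheme
        outputs.append(x)              # one head input per stream
    return outputs
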
@@ -38,3 +38,10 @@ def get_activation_function(activation_function_string: str):
        "Activation function must be one of the following {}. instead it was: {}" \
        .format(activation_functions.keys(), activation_function_string)
    return activation_functions[activation_function_string]


def squeeze_tensor(tensor):
    if tensor.shape[0] == 1:
        return tensor[0]
    else:
        return tensor

@@ -17,7 +17,6 @@
from typing import List

import numpy as np
import scipy.stats

from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters

@@ -31,8 +30,9 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace
class AdditiveNoiseParameters(ExplorationParameters):
    def __init__(self):
        super().__init__()
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        self.evaluation_noise_percentage = 0.05
        self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
        self.evaluation_noise = 0.05
        self.noise_as_percentage_from_action_space = True

    @property
    def path(self):

@@ -48,17 +48,19 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
    2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to
       be the mean of the action, and 2nd is assumed to be its standard deviation.
    """
    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float):
    def __init__(self, action_space: ActionSpace, noise_schedule: Schedule,
                 evaluation_noise: float, noise_as_percentage_from_action_space: bool = True):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
                                          of the action space
        :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
        :param noise_schedule: the schedule for the noise
        :param evaluation_noise: the noise variance that will be used during evaluation phases
        :param noise_as_percentage_from_action_space: a bool deciding whether the noise is absolute or as a percentage
                                                      from the action space
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage
        self.noise_schedule = noise_schedule
        self.evaluation_noise = evaluation_noise
        self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Additive noise exploration works only for continuous controls."

@@ -68,19 +70,20 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
                or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf):
            raise ValueError("Additive noise exploration requires bounded actions")

        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies

        # set the current noise percentage
        # set the current noise
        if self.phase == RunPhase.TEST:
            current_noise_precentage = self.evaluation_noise_percentage
            current_noise = self.evaluation_noise
        else:
            current_noise_precentage = self.noise_percentage_schedule.current_value
            current_noise = self.noise_schedule.current_value

        # scale the noise to the action space range
        action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low)
        if self.noise_as_percentage_from_action_space:
            action_values_std = current_noise * (self.action_space.high - self.action_space.low)
        else:
            action_values_std = current_noise

        # extract the mean values
        if isinstance(action_values, list):

@@ -92,15 +95,18 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):

        # step the noise schedule
        if self.phase is not RunPhase.TEST:
            self.noise_percentage_schedule.step()
            self.noise_schedule.step()
        # the second element of the list is assumed to be the standard deviation
        if isinstance(action_values, list) and len(action_values) > 1:
            action_values_std = action_values[1].squeeze()

        # add noise to the action means
        action = np.random.normal(action_values_mean, action_values_std)
        if self.phase is not RunPhase.TEST:
            action = np.random.normal(action_values_mean, action_values_std)
        else:
            action = action_values_mean

        return action
        return np.atleast_1d(action)

    def get_control_param(self):
        return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
        return np.ones(self.action_space.shape)*self.noise_schedule.current_value

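The new noise_as_percentage_from_action_space flag selects between the two standard-deviation conventions above; the TD3 exploration parameters set it to False so the noise schedule is read as an absolute std. A small numpy illustration (an assumed helper, not library code):

import numpy as np

def exploration_std(noise, action_low, action_high, as_percentage):
    # percentage mode scales by the action-space range; absolute mode uses the value directly
    return noise * (action_high - action_low) if as_percentage else noise

low, high = np.array([-1.0]), np.array([1.0])
print(exploration_std(0.1, low, high, as_percentage=True))    # [0.2] - 10% of the range
print(exploration_std(0.1, low, high, as_percentage=False))   # 0.1   - absolute std, as TD3 uses
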
@@ -32,7 +32,7 @@ class EGreedyParameters(ExplorationParameters):
        self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
        self.evaluation_epsilon = 0.05
        self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
        # for continuous control -
        # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)

@@ -28,10 +28,11 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace
class TruncatedNormalParameters(ExplorationParameters):
    def __init__(self):
        super().__init__()
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        self.evaluation_noise_percentage = 0.05
        self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
        self.evaluation_noise = 0.05
        self.clip_low = 0
        self.clip_high = 1
        self.noise_as_percentage_from_action_space = True

    @property
    def path(self):

@@ -49,17 +50,20 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
    When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
    is within the bounds.
    """
    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float, clip_low: float, clip_high: float):
    def __init__(self, action_space: ActionSpace, noise_schedule: Schedule,
                 evaluation_noise: float, clip_low: float, clip_high: float,
                 noise_as_percentage_from_action_space: bool = True):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
                                          of the action space
        :param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
        :param noise_schedule: the schedule for the noise variance
        :param evaluation_noise: the noise variance that will be used during evaluation phases
        :param noise_as_percentage_from_action_space: whether to consider the noise as a percentage of the action space
                                                      or absolute value
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage
        self.noise_schedule = noise_schedule
        self.evaluation_noise = evaluation_noise
        self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space
        self.clip_low = clip_low
        self.clip_high = clip_high

@@ -71,17 +75,21 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
                or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf):
            raise ValueError("Additive noise exploration requires bounded actions")

        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # set the current noise percentage
        # set the current noise
        if self.phase == RunPhase.TEST:
            current_noise_precentage = self.evaluation_noise_percentage
            current_noise = self.evaluation_noise
        else:
            current_noise_precentage = self.noise_percentage_schedule.current_value
            current_noise = self.noise_schedule.current_value

        # scale the noise to the action space range
        action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low)
        if self.noise_as_percentage_from_action_space:
            action_values_std = current_noise * (self.action_space.high - self.action_space.low)
        else:
            action_values_std = current_noise

        # scale the noise to the action space range
        action_values_std = current_noise * (self.action_space.high - self.action_space.low)

        # extract the mean values
        if isinstance(action_values, list):

@@ -93,7 +101,7 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):

        # step the noise schedule
        if self.phase is not RunPhase.TEST:
            self.noise_percentage_schedule.step()
            self.noise_schedule.step()
        # the second element of the list is assumed to be the standard deviation
        if isinstance(action_values, list) and len(action_values) > 1:
            action_values_std = action_values[1].squeeze()

@@ -107,4 +115,4 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
        return action

    def get_control_param(self):
        return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
        return np.ones(self.action_space.shape)*self.noise_schedule.current_value

@@ -123,8 +123,8 @@ agent_params.input_filter.add_observation_filter(

# no exploration is used
agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise_percentage = 0
agent_params.exploration.noise_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise = 0

# no playing during the training phase
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)

@@ -53,7 +53,7 @@ env_params = GymVectorEnvironment(level='CartPole-v0')
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250
preset_validation_params.max_episodes_to_achieve_reward = 300

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),

@@ -87,9 +87,9 @@ agent_params.memory.shared_memory = True
agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.3)
agent_params.exploration.evaluation_epsilon = 0
# they actually take the noise_percentage_schedule to be 0.2 * max_abs_range which is 0.1 * total_range
agent_params.exploration.continuous_exploration_policy_parameters.noise_percentage_schedule = ConstantSchedule(0.1)
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise_percentage = 0
# they actually take the noise_schedule to be 0.2 * max_abs_range which is 0.1 * total_range
agent_params.exploration.continuous_exploration_policy_parameters.noise_schedule = ConstantSchedule(0.1)
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise = 0

agent_params.input_filter = InputFilter()
agent_params.input_filter.add_observation_filter('observation', 'clipping', ObservationClippingFilter(-200, 200))

@@ -15,7 +15,7 @@ schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(2000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)
schedule_params.heatup_steps = EnvironmentSteps(10000)

#########
# Agent #

@@ -38,7 +38,7 @@ env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 400
preset_validation_params.max_episodes_to_achieve_reward = 1000
preset_validation_params.max_episodes_to_achieve_reward = 3000
preset_validation_params.reward_test_level = 'inverted_pendulum'
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']

rl_coach/presets/Mujoco_TD3.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from rl_coach.agents.td3_agent import TD3AgentParameters
from rl_coach.architectures.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(1000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(5000)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(10000)

#########
# Agent #
#########
agent_params = TD3AgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]

agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(400), Dense(300)]


###############
# Environment #
###############
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 500
preset_validation_params.max_episodes_to_achieve_reward = 1100
preset_validation_params.reward_test_level = 'hopper'
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)

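Presets like this one are normally launched through the coach command line, where a level is picked for SingleLevelSelection via the -lvl flag (also visible in the runner changes further down). A rough programmatic sketch is shown below for orientation only; it assumes the usual rl_coach GraphManager/TaskParameters API, and the exact field names and level-selection call may differ:

from rl_coach.base_parameters import TaskParameters
from rl_coach.presets.Mujoco_TD3 import graph_manager

graph_manager.env_params.level.select('hopper')  # assumption: SingleLevelSelection exposes select()
graph_manager.create_graph(TaskParameters(experiment_path='./experiments/mujoco_td3_hopper'))
graph_manager.improve()
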
@@ -37,9 +37,9 @@ agent_params.network_wrappers['main'].input_embedders_parameters = {
}

agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0.05)
# agent_params.exploration.noise_percentage_schedule = LinearSchedule(0.4, 0.05, 100000)
agent_params.exploration.evaluation_noise_percentage = 0.05
agent_params.exploration.noise_schedule = ConstantSchedule(0.05)
# agent_params.exploration.noise_schedule = LinearSchedule(0.4, 0.05, 100000)
agent_params.exploration.evaluation_noise = 0.05

agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5

@@ -53,10 +53,14 @@ if __name__ == "__main__":
                             "the preset name, followed by the environment level",
                        default='',
                        type=str)
    parser.add_argument('-sd', '--level_as_sub_dir',
    parser.add_argument('-lsd', '--level_as_sub_dir',
                        help="(flag) Store each level in it's own sub directory where the root directory name matches "
                             "the preset name",
                        action='store_true')
    parser.add_argument('-ssd', '--seed_as_sub_dir',
                        help="(flag) Store each seed in it's own sub directory where the root directory name matches "
                             "the preset name",
                        action='store_true')
    parser.add_argument('-ew', '--evaluation_worker',
                        help="(flag) Start an additional worker that will only do evaluation",
                        action='store_true')

@@ -108,6 +112,8 @@ if __name__ == "__main__":
        command.append("-c")
    if args.evaluation_worker:
        command.append("-ew")
    if args.seed_as_sub_dir:
        seed = ''
    if level is not None:
        command.extend(['-lvl', '{}'.format(level)])
    if level_as_sub_dir: