Mirror of https://github.com/gryf/coach.git — synced 2026-02-15 05:25:55 +01:00
This commit is contained in:
Gal Leibovich
2019-06-16 11:11:21 +03:00
committed by GitHub
parent 8df3c46756
commit 7eb884c5b2
107 changed files with 2200 additions and 495 deletions

View File

@@ -41,14 +41,15 @@ class DDPGCriticNetworkParameters(NetworkParameters):
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [DDPGVHeadParameters()]
self.optimizer_type = 'Adam'
self.adam_optimizer_beta2 = 0.999
self.optimizer_epsilon = 1e-8
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.001
self.adam_optimizer_beta2 = 0.999
self.optimizer_epsilon = 1e-8
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
# self.l2_regularization = 1e-2
class DDPGActorNetworkParameters(NetworkParameters):
@@ -58,9 +59,9 @@ class DDPGActorNetworkParameters(NetworkParameters):
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
self.heads_parameters = [DDPGActorHeadParameters()]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.adam_optimizer_beta2 = 0.999
self.optimizer_epsilon = 1e-8
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.0001
self.create_target_network = True
@@ -217,4 +218,4 @@ class DDPGAgent(ActorCriticAgent):
action_info = ActionInfo(action=action,
action_value=q_value)
return action_info
return action_info

View File

@@ -90,7 +90,7 @@ class DDQNBCQAgent(DQNAgent):
if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
return self.networks['reward_model'].online_network.predict(
states,
outputs=[self.networks['reward_model'].online_network.state_embedding])
outputs=[self.networks['reward_model'].online_network.state_embedding[0]])
else:
return states['observation']
self.embedding = to_embedding
@@ -189,7 +189,7 @@ class DDQNBCQAgent(DQNAgent):
if self.ap.algorithm.action_drop_method_parameters.use_state_embedding_instead_of_state:
self.knn_trees = [AnnoyDictionary(
dict_size=knn_size,
key_width=int(self.networks['reward_model'].online_network.state_embedding.shape[-1]),
key_width=int(self.networks['reward_model'].online_network.state_embedding[0].shape[-1]),
batch_size=knn_size)
for _ in range(len(self.spaces.action.actions))]
else:

View File

@@ -194,7 +194,7 @@ class NECAgent(ValueOptimizationAgent):
)
if self.phase != RunPhase.TEST:
# store the state embedding for inserting it to the DND later
self.current_episode_state_embeddings.append(embedding.squeeze())
self.current_episode_state_embeddings.append(embedding[0].squeeze())
actions_q_values = actions_q_values[0][0]
return actions_q_values

View File

@@ -0,0 +1,223 @@
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Union
from collections import OrderedDict
import numpy as np
from rl_coach.agents.agent import Agent
from rl_coach.agents.ddpg_agent import DDPGAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import DDPGActorHeadParameters, TD3VHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, EmbedderScheme
from rl_coach.core_types import ActionInfo, TrainingSteps, Transition
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace, GoalsSpace
class TD3CriticNetworkParameters(NetworkParameters):
    """Network parameters for the TD3 critic: twin Q streams sharing one network wrapper."""

    def __init__(self, num_q_networks):
        """
        :param num_q_networks: number of parallel Q streams in the middleware (TD3 uses 2)
        """
        super().__init__()
        # the observation and the action each get their own input embedder
        self.input_embedders_parameters = {
            'observation': InputEmbedderParameters(),
            'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow),
        }
        # one FC stream per Q network -- these are the twin critics
        self.middleware_parameters = FCMiddlewareParameters(num_streams=num_q_networks)
        self.heads_parameters = [TD3VHeadParameters()]
        # optimization settings
        self.optimizer_type = 'Adam'
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.learning_rate = 0.001
        self.batch_size = 100
        # training topology
        self.async_training = False
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class TD3ActorNetworkParameters(NetworkParameters):
    """Network parameters for the TD3 actor (deterministic policy network)."""

    def __init__(self):
        super().__init__()
        # a single observation embedder feeding an FC middleware
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
        # reuses the DDPG actor head, but without batchnorm
        self.heads_parameters = [DDPGActorHeadParameters(batchnorm=False)]
        # optimization settings
        self.optimizer_type = 'Adam'
        self.adam_optimizer_beta2 = 0.999
        self.optimizer_epsilon = 1e-8
        self.learning_rate = 0.001
        self.batch_size = 100
        # training topology
        self.async_training = False
        self.create_target_network = True
        self.shared_optimizer = True
        self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class TD3AlgorithmParameters(AlgorithmParameters):
    """
    Algorithm parameters for TD3.

    :param num_steps_between_copying_online_weights_to_target: (StepMethod)
        the number of training steps between soft updates of the target network weights
    :param rate_for_copying_weights_to_target: (float)
        the soft-update coefficient used to blend online weights into the target weights
    :param num_consecutive_playing_steps: (StepMethod)
        the number of consecutive steps to act between every two training iterations
    :param use_target_network_for_evaluation: (bool)
        if True, the slower-moving target network predicts actions when acting, giving
        more consistent action choices
    :param action_penalty: (float)
        penalty on large action-feature (pre-activation) values, which keeps the action
        features from saturating the TanH activation and flattening the gradients
    :param clip_critic_targets: (Tuple[float, float] or None)
        range to clip the critic TD targets to, to curb overestimation of action values
    :param use_non_zero_discount_for_terminal_states: (bool)
        if True, terminal states still bootstrap the discounted next-state value;
        if False, the terminal reward alone is taken as the target return
    """

    def __init__(self):
        super().__init__()
        self.rate_for_copying_weights_to_target = 0.005
        self.use_target_network_for_evaluation = False
        self.action_penalty = 0
        # (min_clip_value, max_clip_value) tuple, or None to disable clipping
        self.clip_critic_targets = None
        self.use_non_zero_discount_for_terminal_states = False
        self.act_for_full_episodes = True
        # delayed policy updates: actor and targets update once per this many critic updates
        self.update_policy_every_x_episode_steps = 2
        self.num_steps_between_copying_online_weights_to_target = \
            TrainingSteps(self.update_policy_every_x_episode_steps)
        # target policy smoothing: gaussian noise stddev and the bound it is clipped to
        self.policy_noise = 0.2
        self.noise_clipping = 0.5
        # number of critic streams (twin Q networks)
        self.num_q_networks = 2
class TD3AgentExplorationParameters(AdditiveNoiseParameters):
    """Additive-noise exploration for TD3, with the noise taken as an absolute value
    rather than as a percentage of the action-space range."""

    def __init__(self):
        super().__init__()
        self.noise_as_percentage_from_action_space = False
class TD3AgentParameters(AgentParameters):
    """Bundles the algorithm, exploration, memory and network parameters of a TD3 agent."""

    def __init__(self):
        algorithm_params = TD3AlgorithmParameters()
        # the critic wrapper needs to know how many Q streams to build
        networks = OrderedDict([
            ("actor", TD3ActorNetworkParameters()),
            ("critic", TD3CriticNetworkParameters(algorithm_params.num_q_networks)),
        ])
        super().__init__(algorithm=algorithm_params,
                         exploration=TD3AgentExplorationParameters(),
                         memory=EpisodicExperienceReplayParameters(),
                         networks=networks)

    @property
    def path(self):
        return 'rl_coach.agents.td3_agent:TD3Agent'
# Twin Delayed DDPG - https://arxiv.org/pdf/1802.09477.pdf
class TD3Agent(DDPGAgent):
    """
    TD3 agent. Extends DDPG with the three TD3 mechanisms: clipped double-Q learning
    (twin critics, min of their targets), delayed policy updates, and target policy
    smoothing (clipped noise on the target actions).
    """

    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        # signals exposed for monitoring/dashboarding
        self.q_values = self.register_signal("Q")
        self.TD_targets_signal = self.register_signal("TD targets")
        self.action_signal = self.register_signal("actions")

    def learn_from_batch(self, batch):
        """
        Run one TD3 training step on a batch of transitions: train the twin critics
        towards the clipped double-Q TD target, and every
        `update_policy_every_x_episode_steps` iterations update the actor with the
        deterministic policy gradient taken through the first critic.

        :param batch: the batch of transitions to train on
        :return: (total_loss, losses, unclipped_grads) from the critic training step
        """
        actor = self.networks['actor']
        critic = self.networks['critic']

        actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
        critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

        # TD error = r + discount*max(q_st_plus_1) - q_st
        # one session run returns both the target policy's next actions and the online
        # policy's actions for the current states
        next_actions, actions_mean = actor.parallel_prediction([
            (actor.target_network, batch.next_states(actor_keys)),
            (actor.online_network, batch.states(actor_keys))
        ])

        # target policy smoothing: add clipped gaussian noise to the next actions,
        # then clip back into the action space
        noise = np.random.normal(0, self.ap.algorithm.policy_noise, next_actions.shape).clip(
            -self.ap.algorithm.noise_clipping, self.ap.algorithm.noise_clipping)
        next_actions = self.spaces.action.clip_action_to_space(next_actions + noise)

        critic_inputs = copy.copy(batch.next_states(critic_keys))
        critic_inputs['action'] = next_actions
        q_st_plus_1 = critic.target_network.predict(critic_inputs)[2]  # output #2 is the min (Q1, Q2)

        # calculate the bootstrapped TD targets while discounting terminal states according to
        # use_non_zero_discount_for_terminal_states
        if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
            TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
        else:
            TD_targets = batch.rewards(expand_dims=True) + \
                         (1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1

        # clip the TD targets to prevent overestimation errors
        if self.ap.algorithm.clip_critic_targets:
            TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)

        self.TD_targets_signal.add_sample(TD_targets)

        # train the critic
        critic_inputs = copy.copy(batch.states(critic_keys))
        # expand 1-dimensional actions so they match the network's action input shape
        critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
        result = critic.train_and_sync_networks(critic_inputs, TD_targets)
        total_loss, losses, unclipped_grads = result[:3]

        # delayed policy updates: only update the actor every x critic updates
        if self.training_iteration % self.ap.algorithm.update_policy_every_x_episode_steps == 0:
            # get the gradients of output #3 (=mean of Q1 network) w.r.t the action
            critic_inputs = copy.copy(batch.states(critic_keys))
            critic_inputs['action'] = actions_mean
            action_gradients = critic.online_network.predict(critic_inputs,
                                                             outputs=critic.online_network.gradients_wrt_inputs[3]['action'])

            # apply the (negated, for gradient ascent) gradients from the critic to the actor
            initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
            gradients = actor.online_network.predict(batch.states(actor_keys),
                                                     outputs=actor.online_network.weighted_gradients[0],
                                                     initial_feed_dict=initial_feed_dict)

            # distributed training: accumulate into the global network, then refresh locally
            if actor.has_global:
                actor.apply_gradients_to_global_network(gradients)
                actor.update_online_network()
            else:
                actor.apply_gradients_to_online_network(gradients)

        return total_loss, losses, unclipped_grads

    def train(self):
        """Train for as many consecutive steps as were played in the current episode
        (TD3 acts for full episodes, then trains once per environment step taken)."""
        self.ap.algorithm.num_consecutive_training_steps = self.current_episode_steps_counter
        return Agent.train(self)

    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.
        :param transition: the transition to update
        :return: the updated transition
        """
        # a time-limit termination is not a true terminal state: clear game_over so the
        # critic still bootstraps from the next state.
        # NOTE(review): relies on the wrapped gym env exposing _max_episode_steps -- verify
        # for environments without a TimeLimit wrapper
        transition.game_over = False if self.current_episode_steps_counter ==\
                                        self.parent_level_manager.environment.env._max_episode_steps\
            else transition.game_over

        return transition

View File

@@ -221,3 +221,14 @@ class SACQHeadParameters(HeadParameters):
super().__init__(parameterized_class_name='SACQHead', activation_function=activation_function, name=name,
dense_layer=dense_layer)
self.network_layers_sizes = layers_sizes
class TD3VHeadParameters(HeadParameters):
    """Parameters for the TD3 twin-Q value head (``TD3VHead``)."""

    def __init__(self, activation_function: str ='relu', name: str='td3_v_head_params',
                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
                 loss_weight: float = 1.0, dense_layer=None, initializer='xavier'):
        super().__init__(parameterized_class_name="TD3VHead",
                         activation_function=activation_function,
                         name=name,
                         dense_layer=dense_layer,
                         num_output_head_copies=num_output_head_copies,
                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                         loss_weight=loss_weight)
        # weight initializer for the Q output layers: 'xavier' or 'normalized_columns'
        self.initializer = initializer

View File

@@ -41,10 +41,11 @@ class FCMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu',
scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout_rate: float = 0.0,
name="middleware_fc_embedder", dense_layer=None, is_training=False):
name="middleware_fc_embedder", dense_layer=None, is_training=False, num_streams=1):
super().__init__(parameterized_class_name="FCMiddleware", activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout_rate=dropout_rate, name=name, dense_layer=dense_layer,
is_training=is_training)
self.num_streams = num_streams
class LSTMMiddlewareParameters(MiddlewareParameters):

View File

@@ -203,7 +203,6 @@ class TensorFlowArchitecture(Architecture):
self._create_gradient_accumulators()
# gradients of the outputs w.r.t. the inputs
# at the moment, this is only used by ddpg
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
self.inputs.items()} for output in self.outputs]
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')

View File

@@ -16,6 +16,7 @@ from .sac_head import SACPolicyHead
from .sac_q_head import SACQHead
from .classification_head import ClassificationHead
from .cil_head import RegressionHead
from .td3_v_head import TD3VHead
from .ddpg_v_head import DDPGVHead
__all__ = [
@@ -37,5 +38,6 @@ __all__ = [
'SACQHead',
'ClassificationHead',
'RegressionHead',
'TD3VHead'
'DDPGVHead'
]

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type
import numpy as np
import tensorflow as tf
@@ -22,7 +21,7 @@ from rl_coach.architectures.tensorflow_components.layers import Dense, convert_l
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import force_list
from rl_coach.architectures.tensorflow_components.utils import squeeze_tensor
# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
@@ -72,8 +71,9 @@ class Head(object):
:param input_layer: the input to the graph
:return: the output of the last layer and the target placeholder
"""
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
self._build_module(input_layer)
self._build_module(squeeze_tensor(input_layer))
self.output = force_list(self.output)
self.target = force_list(self.target)

View File

@@ -0,0 +1,67 @@
#
# Copyright (c) 2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import VStateValue
from rl_coach.spaces import SpacesDefinition
class TD3VHead(Head):
    """
    Value head for TD3's twin critics.

    Builds one linear Q output per critic stream of the middleware, and exposes
    the outputs in a fixed order that the agent indexes into:
    [Q1, Q2, min(Q1, Q2), mean(Q1)]. Each stream is regressed towards the shared
    (min-based) TD target placeholder.
    """

    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense, initializer='xavier'):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'td3_v_values_head'
        self.return_type = VStateValue
        self.loss_type = []
        # 'xavier' (or None) or 'normalized_columns' -- validated in _build_module
        self.initializer = initializer
        self.loss = []
        self.output = []

    def _build_module(self, input_layer):
        """
        Build the per-stream Q outputs, the min/mean auxiliary outputs and the summed loss.

        :param input_layer: stacked per-stream middleware output; axis 0 indexes the
            critic streams (assumed to be 2 for TD3's twin critics)
        :raises ValueError: if ``self.initializer`` is not a recognized initializer name
        """
        q_outputs = []
        # shared TD target (the clipped double-Q target computed by the agent)
        self.target = tf.placeholder(tf.float32, shape=(None, 1), name="q_networks_min_placeholder")
        for i in range(input_layer.shape[0]):
            # one linear Q-value output per critic stream
            if self.initializer == 'normalized_columns':
                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1),
                                                     kernel_initializer=normalized_columns_initializer(1.0)))
            elif self.initializer == 'xavier' or self.initializer is None:
                q_outputs.append(self.dense_layer(1)(input_layer[i], name='q_output_{}'.format(i + 1)))
            else:
                # fail fast with a clear message; previously an unknown initializer fell
                # through and q_outputs[i] raised an uninformative IndexError
                raise ValueError("Unknown initializer '{}'. Expected 'xavier' or "
                                 "'normalized_columns'.".format(self.initializer))
            self.output.append(q_outputs[i])
            # each stream regresses to the shared TD target
            self.loss.append(tf.reduce_mean((self.target - q_outputs[i]) ** 2))
        # element-wise min over the streams -- used by the agent for the TD targets
        self.output.append(tf.reduce_min(q_outputs, axis=0))
        # mean of the first stream's Q -- used for the actor's policy gradient
        self.output.append(tf.reduce_mean(self.output[0]))
        self.loss = sum(self.loss)
        tf.losses.add_loss(self.loss)

    def __str__(self):
        result = [
            "Q1 Action-Value Stream",
            "\tDense (num outputs = 1)",
            "Q2 Action-Value Stream",
            "\tDense (num outputs = 1)",
            "Min (Q1, Q2)"
        ]
        return '\n'.join(result)

View File

@@ -28,23 +28,28 @@ class FCMiddleware(Middleware):
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout_rate: float = 0.0,
name="middleware_fc_embedder", dense_layer=Dense, is_training=False):
name="middleware_fc_embedder", dense_layer=Dense, is_training=False, num_streams: int = 1):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout_rate=dropout_rate, scheme=scheme, name=name, dense_layer=dense_layer,
is_training=is_training)
self.return_type = Middleware_FC_Embedding
self.layers = []
assert(isinstance(num_streams, int) and num_streams >= 1)
self.num_streams = num_streams
def _build_module(self):
self.layers.append(self.input)
self.output = []
for idx, layer_params in enumerate(self.layers_params):
self.layers.extend(force_list(
layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx),
is_training=self.is_training)
))
for stream_idx in range(self.num_streams):
layers = [self.input]
self.output = self.layers[-1]
for idx, layer_params in enumerate(self.layers_params):
layers.extend(force_list(
layer_params(layers[-1], name='{}_{}'.format(layer_params.__class__.__name__,
idx + stream_idx * len(self.layers_params)),
is_training=self.is_training)
))
self.output.append((layers[-1]))
@property
def schemes(self):
@@ -72,3 +77,15 @@ class FCMiddleware(Middleware):
]
}
def __str__(self):
stream = [str(l) for l in self.layers_params]
if self.layers_params:
if self.num_streams > 1:
stream = [''] + ['\t' + l for l in stream]
result = stream * self.num_streams
result[0::len(stream)] = ['Stream {}'.format(i) for i in range(self.num_streams)]
else:
result = stream
return '\n'.join(result)
else:
return 'No layers'

View File

@@ -38,3 +38,10 @@ def get_activation_function(activation_function_string: str):
"Activation function must be one of the following {}. instead it was: {}" \
.format(activation_functions.keys(), activation_function_string)
return activation_functions[activation_function_string]
def squeeze_tensor(tensor):
    """Drop a leading singleton axis: return ``tensor[0]`` when the first dimension
    has size 1, otherwise return the tensor unchanged."""
    return tensor[0] if tensor.shape[0] == 1 else tensor

View File

@@ -17,7 +17,6 @@
from typing import List
import numpy as np
import scipy.stats
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ContinuousActionExplorationPolicy, ExplorationParameters
@@ -31,8 +30,9 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace
class AdditiveNoiseParameters(ExplorationParameters):
def __init__(self):
super().__init__()
self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
self.evaluation_noise_percentage = 0.05
self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
self.evaluation_noise = 0.05
self.noise_as_percentage_from_action_space = True
@property
def path(self):
@@ -48,17 +48,19 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
2. Specified by the agents action. In case the agents action is a list with 2 values, the 1st one is assumed to
be the mean of the action, and 2nd is assumed to be its standard deviation.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float):
def __init__(self, action_space: ActionSpace, noise_schedule: Schedule,
evaluation_noise: float, noise_as_percentage_from_action_space: bool = True):
"""
:param action_space: the action space used by the environment
:param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
of the action space
:param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
:param noise_schedule: the schedule for the noise
:param evaluation_noise: the noise variance that will be used during evaluation phases
:param noise_as_percentage_from_action_space: a bool deciding whether the noise is absolute or as a percentage
from the action space
"""
super().__init__(action_space)
self.noise_percentage_schedule = noise_percentage_schedule
self.evaluation_noise_percentage = evaluation_noise_percentage
self.noise_schedule = noise_schedule
self.evaluation_noise = evaluation_noise
self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space
if not isinstance(action_space, BoxActionSpace):
raise ValueError("Additive noise exploration works only for continuous controls."
@@ -68,19 +70,20 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf):
raise ValueError("Additive noise exploration requires bounded actions")
# TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage
def get_action(self, action_values: List[ActionType]) -> ActionType:
# TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies
# set the current noise percentage
# set the current noise
if self.phase == RunPhase.TEST:
current_noise_precentage = self.evaluation_noise_percentage
current_noise = self.evaluation_noise
else:
current_noise_precentage = self.noise_percentage_schedule.current_value
current_noise = self.noise_schedule.current_value
# scale the noise to the action space range
action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low)
if self.noise_as_percentage_from_action_space:
action_values_std = current_noise * (self.action_space.high - self.action_space.low)
else:
action_values_std = current_noise
# extract the mean values
if isinstance(action_values, list):
@@ -92,15 +95,18 @@ class AdditiveNoise(ContinuousActionExplorationPolicy):
# step the noise schedule
if self.phase is not RunPhase.TEST:
self.noise_percentage_schedule.step()
self.noise_schedule.step()
# the second element of the list is assumed to be the standard deviation
if isinstance(action_values, list) and len(action_values) > 1:
action_values_std = action_values[1].squeeze()
# add noise to the action means
action = np.random.normal(action_values_mean, action_values_std)
if self.phase is not RunPhase.TEST:
action = np.random.normal(action_values_mean, action_values_std)
else:
action = action_values_mean
return action
return np.atleast_1d(action)
def get_control_param(self):
return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
return np.ones(self.action_space.shape)*self.noise_schedule.current_value

View File

@@ -32,7 +32,7 @@ class EGreedyParameters(ExplorationParameters):
self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
self.evaluation_epsilon = 0.05
self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
# for continuous control -
# (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)

View File

@@ -28,10 +28,11 @@ from rl_coach.spaces import ActionSpace, BoxActionSpace
class TruncatedNormalParameters(ExplorationParameters):
def __init__(self):
super().__init__()
self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
self.evaluation_noise_percentage = 0.05
self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
self.evaluation_noise = 0.05
self.clip_low = 0
self.clip_high = 1
self.noise_as_percentage_from_action_space = True
@property
def path(self):
@@ -49,17 +50,20 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
When the sampled action is outside of the action bounds given by the user, it is sampled again and again, until it
is within the bounds.
"""
def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
evaluation_noise_percentage: float, clip_low: float, clip_high: float):
def __init__(self, action_space: ActionSpace, noise_schedule: Schedule,
evaluation_noise: float, clip_low: float, clip_high: float,
noise_as_percentage_from_action_space: bool = True):
"""
:param action_space: the action space used by the environment
:param noise_percentage_schedule: the schedule for the noise variance percentage relative to the absolute range
of the action space
:param evaluation_noise_percentage: the noise variance percentage that will be used during evaluation phases
:param noise_schedule: the schedule for the noise variance
:param evaluation_noise: the noise variance that will be used during evaluation phases
:param noise_as_percentage_from_action_space: whether to consider the noise as a percentage of the action space
or absolute value
"""
super().__init__(action_space)
self.noise_percentage_schedule = noise_percentage_schedule
self.evaluation_noise_percentage = evaluation_noise_percentage
self.noise_schedule = noise_schedule
self.evaluation_noise = evaluation_noise
self.noise_as_percentage_from_action_space = noise_as_percentage_from_action_space
self.clip_low = clip_low
self.clip_high = clip_high
@@ -71,17 +75,21 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
or not np.all(-np.inf < action_space.low) or not np.all(action_space.low < np.inf):
raise ValueError("Additive noise exploration requires bounded actions")
# TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage
def get_action(self, action_values: List[ActionType]) -> ActionType:
# set the current noise percentage
# set the current noise
if self.phase == RunPhase.TEST:
current_noise_precentage = self.evaluation_noise_percentage
current_noise = self.evaluation_noise
else:
current_noise_precentage = self.noise_percentage_schedule.current_value
current_noise = self.noise_schedule.current_value
# scale the noise to the action space range
action_values_std = current_noise_precentage * (self.action_space.high - self.action_space.low)
if self.noise_as_percentage_from_action_space:
action_values_std = current_noise * (self.action_space.high - self.action_space.low)
else:
action_values_std = current_noise
# scale the noise to the action space range
action_values_std = current_noise * (self.action_space.high - self.action_space.low)
# extract the mean values
if isinstance(action_values, list):
@@ -93,7 +101,7 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
# step the noise schedule
if self.phase is not RunPhase.TEST:
self.noise_percentage_schedule.step()
self.noise_schedule.step()
# the second element of the list is assumed to be the standard deviation
if isinstance(action_values, list) and len(action_values) > 1:
action_values_std = action_values[1].squeeze()
@@ -107,4 +115,4 @@ class TruncatedNormal(ContinuousActionExplorationPolicy):
return action
def get_control_param(self):
return np.ones(self.action_space.shape)*self.noise_percentage_schedule.current_value
return np.ones(self.action_space.shape)*self.noise_schedule.current_value

View File

@@ -123,8 +123,8 @@ agent_params.input_filter.add_observation_filter(
# no exploration is used
agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise_percentage = 0
agent_params.exploration.noise_schedule = ConstantSchedule(0)
agent_params.exploration.evaluation_noise = 0
# no playing during the training phase
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)

View File

@@ -53,7 +53,7 @@ env_params = GymVectorEnvironment(level='CartPole-v0')
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250
preset_validation_params.max_episodes_to_achieve_reward = 300
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
schedule_params=schedule_params, vis_params=VisualizationParameters(),

View File

@@ -87,9 +87,9 @@ agent_params.memory.shared_memory = True
agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.3)
agent_params.exploration.evaluation_epsilon = 0
# they actually take the noise_percentage_schedule to be 0.2 * max_abs_range which is 0.1 * total_range
agent_params.exploration.continuous_exploration_policy_parameters.noise_percentage_schedule = ConstantSchedule(0.1)
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise_percentage = 0
# they actually take the noise_schedule to be 0.2 * max_abs_range which is 0.1 * total_range
agent_params.exploration.continuous_exploration_policy_parameters.noise_schedule = ConstantSchedule(0.1)
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise = 0
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_observation_filter('observation', 'clipping', ObservationClippingFilter(-200, 200))

View File

@@ -15,7 +15,7 @@ schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(2000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)
schedule_params.heatup_steps = EnvironmentSteps(10000)
#########
# Agent #
@@ -38,7 +38,7 @@ env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 400
preset_validation_params.max_episodes_to_achieve_reward = 1000
preset_validation_params.max_episodes_to_achieve_reward = 3000
preset_validation_params.reward_test_level = 'inverted_pendulum'
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']

View File

@@ -0,0 +1,49 @@
from rl_coach.agents.td3_agent import TD3AgentParameters
from rl_coach.architectures.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(1000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(5000)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
# random acting steps used to fill the replay buffer before training starts
schedule_params.heatup_steps = EnvironmentSteps(10000)

#########
# Agent #
#########
agent_params = TD3AgentParameters()
# actor: 400-unit embedder followed by a 300-unit middleware layer
# (presumably matching the 400/300 sizes of the original TD3 implementation -- verify)
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
# critic: empty embedders so all layers (400, 300) live in the shared middleware streams
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(400), Dense(300)]

###############
# Environment #
###############
# the concrete Mujoco level is selected at run time via the command line
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 500
preset_validation_params.max_episodes_to_achieve_reward = 1100
preset_validation_params.reward_test_level = 'hopper'
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)

View File

@@ -37,9 +37,9 @@ agent_params.network_wrappers['main'].input_embedders_parameters = {
}
agent_params.exploration = AdditiveNoiseParameters()
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0.05)
# agent_params.exploration.noise_percentage_schedule = LinearSchedule(0.4, 0.05, 100000)
agent_params.exploration.evaluation_noise_percentage = 0.05
agent_params.exploration.noise_schedule = ConstantSchedule(0.05)
# agent_params.exploration.noise_schedule = LinearSchedule(0.4, 0.05, 100000)
agent_params.exploration.evaluation_noise = 0.05
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5

View File

@@ -53,10 +53,14 @@ if __name__ == "__main__":
"the preset name, followed by the environment level",
default='',
type=str)
parser.add_argument('-sd', '--level_as_sub_dir',
parser.add_argument('-lsd', '--level_as_sub_dir',
help="(flag) Store each level in it's own sub directory where the root directory name matches "
"the preset name",
action='store_true')
parser.add_argument('-ssd', '--seed_as_sub_dir',
help="(flag) Store each seed in it's own sub directory where the root directory name matches "
"the preset name",
action='store_true')
parser.add_argument('-ew', '--evaluation_worker',
help="(flag) Start an additional worker that will only do evaluation",
action='store_true')
@@ -108,6 +112,8 @@ if __name__ == "__main__":
command.append("-c")
if args.evaluation_worker:
command.append("-ew")
if args.seed_as_sub_dir:
seed = ''
if level is not None:
command.extend(['-lvl', '{}'.format(level)])
if level_as_sub_dir: