
ACER algorithm (#184)

* initial ACER commit

* Code cleanup + several fixes

* Q-retrace bug fix + small clean-ups

* added documentation for acer

* ACER benchmarks

* update benchmarks table

* Add nightly running of golden and trace tests. (#202)

Resolves #200

* comment out nightly trace tests until values reset.

* remove redundant observe ignore (#168)

* ensure nightly test env containers exist. (#205)

Also bump integration test timeout

* wxPython removal (#207)

Replacing wxPython with Python's Tkinter.
Also removing the option to choose multiple files as it is unused and causes errors, and fixing the load file/directory spinner.

* Create CONTRIBUTING.md (#210)

* Create CONTRIBUTING.md.  Resolves #188

* run nightly golden tests sequentially. (#217)

Should reduce resource requirements and potential CPU contention but increases
overall execution time.

* tests: added new setup configuration + test args (#211)

- added utils for future tests and conftest
- added test args

* new docs build

* golden test update
shadiendrawis authored on 2019-02-20 23:52:34 +02:00; committed by GitHub
parent 7253f511ed
commit 2b5d1dabe6
175 changed files with 2327 additions and 664 deletions

View File

@@ -0,0 +1,198 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import ACERPolicyHeadParameters, QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters
from rl_coach.core_types import Batch
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import eps, last_sample
class ACERAlgorithmParameters(AlgorithmParameters):
"""
:param num_steps_between_gradient_updates: (int)
Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step formulation.
:param ratio_of_replay: (int)
The average number of off-policy training iterations performed for each on-policy iteration (the actual count is sampled from a Poisson distribution with this mean).
:param num_transitions_to_start_replay: (int)
Number of environment steps until ACER starts to train off-policy from the experience replay.
This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
the experience replay to start the off-policy training.
:param rate_for_copying_weights_to_target: (float)
The rate of the exponential moving average for the average policy, which is used for the trust region optimization.
The target network in this algorithm is used as the average policy.
:param importance_weight_truncation: (float)
The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).
:param use_trust_region_optimization: (bool)
If set to True, the gradients of the network will be modified with a term dependent on the KL divergence between
the average policy and the current one, to bound the change of the policy during the network update.
:param max_KL_divergence: (float)
The upper bound parameter for the trust region optimization; use_trust_region_optimization needs to be set to True
for this parameter to have an effect.
:param beta_entropy: (float)
An entropy regularization term can be added to the loss function in order to control exploration. This term
is weighted using the beta value defined by beta_entropy.
"""
def __init__(self):
super().__init__()
self.apply_gradients_every_x_episodes = 5
self.num_steps_between_gradient_updates = 5000
self.ratio_of_replay = 4
self.num_transitions_to_start_replay = 10000
self.rate_for_copying_weights_to_target = 0.99
self.importance_weight_truncation = 10.0
self.use_trust_region_optimization = True
self.max_KL_divergence = 1.0
self.beta_entropy = 0
class ACERNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
self.optimizer_type = 'Adam'
self.async_training = True
self.clip_gradients = 40.0
self.create_target_network = True
class ACERAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ACERAlgorithmParameters(),
exploration={DiscreteActionSpace: CategoricalParameters()},
memory=EpisodicExperienceReplayParameters(),
networks={"main": ACERNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.acer_agent:ACERAgent'
# Actor-Critic with Experience Replay - https://arxiv.org/abs/1611.01224
class ACERAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.q_loss = self.register_signal('Q Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.probability_loss = self.register_signal('Probability Loss')
self.bias_correction_loss = self.register_signal('Bias Correction Loss')
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.V_Values = self.register_signal('Values')
self.kl_divergence = self.register_signal('KL Divergence')
def _learn_from_batch(self, batch):
fetches = [self.networks['main'].online_network.output_heads[1].probability_loss,
self.networks['main'].online_network.output_heads[1].bias_correction_loss,
self.networks['main'].online_network.output_heads[1].kl_divergence]
# batch contains a list of transitions to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
Q_values, policy_prob = self.networks['main'].online_network.predict(batch.states(network_keys))
avg_policy_prob = self.networks['main'].target_network.predict(batch.states(network_keys))[1]
current_state_values = np.sum(policy_prob * Q_values, axis=1)
actions = batch.actions()
num_transitions = batch.size
Q_head_targets = Q_values
Q_i = Q_values[np.arange(num_transitions), actions]
mu = batch.info('all_action_probabilities')
rho = policy_prob / (mu + eps)
rho_i = rho[np.arange(batch.size), actions]
rho_bar = np.minimum(1.0, rho_i)
if batch.game_overs()[-1]:
Qret = 0
else:
result = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))
Qret = np.sum(result[0] * result[1], axis=1)[0]
for i in reversed(range(num_transitions)):
Qret = batch.rewards()[i] + self.ap.algorithm.discount * Qret
Q_head_targets[i, actions[i]] = Qret
Qret = rho_bar[i] * (Qret - Q_i[i]) + current_state_values[i]
Q_retrace = Q_head_targets[np.arange(num_transitions), actions]
# train
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_1_0': actions,
'output_1_1': rho,
'output_1_2': rho_i,
'output_1_3': Q_values,
'output_1_4': Q_retrace,
'output_1_5': avg_policy_prob},
[Q_head_targets, current_state_values],
additional_fetches=fetches)
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
# logging
total_loss, losses, unclipped_grads, fetch_result = result[:4]
self.q_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
self.probability_loss.add_sample(fetch_result[0])
self.bias_correction_loss.add_sample(fetch_result[1])
self.unclipped_grads.add_sample(unclipped_grads)
self.V_Values.add_sample(current_state_values)
self.kl_divergence.add_sample(fetch_result[2])
return total_loss, losses, unclipped_grads
def learn_from_batch(self, batch):
# perform on-policy training iteration
total_loss, losses, unclipped_grads = self._learn_from_batch(batch)
if self.ap.algorithm.ratio_of_replay > 0 \
and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
# perform n off-policy training iterations
for _ in range(n):
new_batch = Batch(self.call_memory('sample', (self.ap.algorithm.num_steps_between_gradient_updates, True)))
result = self._learn_from_batch(new_batch)
total_loss += result[0]
losses += result[1]
unclipped_grads += result[2]
return total_loss, losses, unclipped_grads
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)[1:] # index 0 is the state value
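
Note (not part of the diff): the reversed loop in _learn_from_batch above accumulates the Q-retrace target from the ACER paper (https://arxiv.org/abs/1611.01224). Writing the truncated importance weight as \bar{\rho}_t = \min\left(1, \pi(a_t \mid x_t) / \mu(a_t \mid x_t)\right) and the state value as V(x_t) = \sum_a \pi(a \mid x_t) Q(x_t, a), the recursion evaluated backwards over the sampled trajectory is

Q^{\mathrm{ret}}(x_t, a_t) = r_t + \gamma \left[ \bar{\rho}_{t+1} \left( Q^{\mathrm{ret}}(x_{t+1}, a_{t+1}) - Q(x_{t+1}, a_{t+1}) \right) + V(x_{t+1}) \right]

bootstrapped from V of the state that follows the batch (or from 0 if the last transition is terminal). Each Q^{\mathrm{ret}}(x_t, a_t) is written into Q_head_targets[t, a_t] before the corrected bootstrap is carried to step t-1, and the Poisson-distributed number of off-policy iterations in learn_from_batch corresponds to the paper's replay-ratio scheme.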

View File

@@ -1026,7 +1026,7 @@ class Agent(AgentInterface):
"""
Collect all of agent's network savers
:param parent_path_suffix: path suffix of the parent of the agent
(could be name of level manager or composite agent)
(could be name of level manager or composite agent)
:return: collection of all agent savers
"""
parent_path_suffix = "{}.{}".format(parent_path_suffix, self.name)

View File

@@ -149,7 +149,7 @@ class PolicyOptimizationAgent(Agent):
action_probabilities = np.array(action_values).squeeze()
action = self.exploration_policy.get_action(action_probabilities)
action_info = ActionInfo(action=action,
action_probability=action_probabilities[action])
all_action_probabilities=action_probabilities)
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
elif isinstance(self.spaces.action, BoxActionSpace):

View File

@@ -176,3 +176,13 @@ class RainbowQHeadParameters(HeadParameters):
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)
class ACERPolicyHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='acer_policy_head_params',
num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
loss_weight: float = 1.0, dense_layer=None):
super().__init__(parameterized_class_name="ACERPolicyHead", activation_function=activation_function, name=name,
dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
loss_weight=loss_weight)

View File

@@ -350,7 +350,7 @@ class TensorFlowArchitecture(Architecture):
importance_weight = np.ones(target_ph.shape[0])
else:
importance_weight = importance_weights[placeholder_idx]
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight

View File

@@ -11,6 +11,7 @@ from .q_head import QHead
from .quantile_regression_q_head import QuantileRegressionQHead
from .rainbow_q_head import RainbowQHead
from .v_head import VHead
from .acer_policy_head import ACERPolicyHead
__all__ = [
'CategoricalQHead',
@@ -25,5 +26,6 @@ __all__ = [
'QHead',
'QuantileRegressionQHead',
'RainbowQHead',
'VHead'
'VHead',
'ACERPolicyHead'
]

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import ActionProbabilities
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
class ACERPolicyHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
dense_layer=Dense):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
dense_layer=dense_layer)
self.name = 'acer_policy_head'
self.return_type = ActionProbabilities
self.beta = None
self.action_penalty = None
# a scalar weight that penalizes low entropy values to encourage exploration
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
# we set the beta value as a tf variable so it can be updated later if needed
self.beta = tf.Variable(float(agent_parameters.algorithm.beta_entropy),
trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
self.beta_placeholder = tf.placeholder('float')
self.set_beta = tf.assign(self.beta, self.beta_placeholder)
def _build_module(self, input_layer):
if isinstance(self.spaces.action, DiscreteActionSpace):
# create a discrete action network (softmax probabilities output)
self._build_discrete_net(input_layer, self.spaces.action)
else:
raise ValueError("only discrete action spaces are supported for ACER")
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# Truncated importance sampling with bias corrections
importance_sampling_weight = tf.placeholder(tf.float32, [None, self.num_actions],
name='{}_importance_sampling_weight'.format(self.get_name()))
self.input.append(importance_sampling_weight)
importance_sampling_weight_i = tf.placeholder(tf.float32, [None],
name='{}_importance_sampling_weight_i'.format(self.get_name()))
self.input.append(importance_sampling_weight_i)
V_values = tf.placeholder(tf.float32, [None], name='{}_V_values'.format(self.get_name()))
self.target.append(V_values)
Q_values = tf.placeholder(tf.float32, [None, self.num_actions], name='{}_Q_values'.format(self.get_name()))
self.input.append(Q_values)
Q_retrace = tf.placeholder(tf.float32, [None], name='{}_Q_retrace'.format(self.get_name()))
self.input.append(Q_retrace)
action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
self.probability_loss = -tf.reduce_mean(action_log_probs_wrt_policy
* (Q_retrace - V_values)
* tf.minimum(self.ap.algorithm.importance_weight_truncation,
importance_sampling_weight_i))
log_probs_wrt_policy = tf.log(self.policy_probs + eps)
bias_correction_gain = tf.reduce_sum(log_probs_wrt_policy
* (Q_values - tf.expand_dims(V_values, 1))
* tf.nn.relu(1.0 - (self.ap.algorithm.importance_weight_truncation
/ (importance_sampling_weight + eps)))
* tf.stop_gradient(self.policy_probs),
axis=1)
self.bias_correction_loss = -tf.reduce_mean(bias_correction_gain)
self.loss = self.probability_loss + self.bias_correction_loss
tf.losses.add_loss(self.loss)
# Trust region
batch_size = tf.to_float(tf.shape(input_layer)[0])
average_policy = tf.placeholder(tf.float32, [None, self.num_actions],
name='{}_average_policy'.format(self.get_name()))
self.input.append(average_policy)
average_policy_distribution = tf.contrib.distributions.Categorical(probs=(average_policy + eps))
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(average_policy_distribution,
self.policy_distribution))
if self.ap.algorithm.use_trust_region_optimization:
@tf.custom_gradient
def trust_region_layer(x):
def grad(g):
g = - g * batch_size
k = - average_policy / (self.policy_probs + eps)
adj = tf.nn.relu(
(tf.reduce_sum(k * g, axis=1) - self.ap.algorithm.max_KL_divergence)
/ (tf.reduce_sum(tf.square(k), axis=1) + eps))
g = g - tf.expand_dims(adj, 1) * k
return - g / batch_size
return tf.identity(x), grad
self.output = trust_region_layer(self.output)
def _build_discrete_net(self, input_layer, action_space):
self.num_actions = len(action_space.actions)
self.actions = tf.placeholder(tf.int32, [None], name='{}_actions'.format(self.get_name()))
self.input.append(self.actions)
policy_values = self.dense_layer(self.num_actions)(input_layer, name='fc')
self.policy_probs = tf.nn.softmax(policy_values, name='{}_policy'.format(self.get_name()))
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
self.output = self.policy_probs
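
Note (not part of the diff): the two loss terms above are the truncated importance sampling term and its bias correction from the ACER paper, and the custom-gradient layer applies the paper's trust region update. With c the truncation constant (importance_weight_truncation), \rho_t(a) = \pi(a \mid x_t) / \mu(a \mid x_t), and Q^{\mathrm{ret}} as in the agent code, the estimated policy gradient is

\hat{g}_t = \min(c, \rho_t(a_t)) \nabla_\theta \log \pi(a_t \mid x_t) \left[ Q^{\mathrm{ret}}(x_t, a_t) - V(x_t) \right] + \sum_a \pi(a \mid x_t) \left[ 1 - \frac{c}{\rho_t(a)} \right]_+ \nabla_\theta \log \pi(a \mid x_t) \left[ Q(x_t, a) - V(x_t) \right]

The trust region step projects the gradient g taken with respect to the policy probabilities \phi, using k = \nabla_\phi D_{\mathrm{KL}}(\pi_{\mathrm{avg}} \,\|\, \pi_\phi) = -\pi_{\mathrm{avg}} / \pi_\phi and the bound \delta (max_KL_divergence):

g' = g - \max\left(0, \frac{k^\top g - \delta}{\|k\|_2^2}\right) k

which is what the grad function inside trust_region_layer computes; the multiplications by batch_size convert between the per-sample gradients used in the paper and the batch-averaged gradient TensorFlow passes in.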

View File

@@ -329,12 +329,12 @@ class ActionInfo(object):
Action info is a class that holds an action and various additional information details about it
"""
def __init__(self, action: ActionType, action_probability: float=0,
def __init__(self, action: ActionType, all_action_probabilities: float=0,
action_value: float=0., state_value: float=0., max_action_value: float=None,
action_intrinsic_reward: float=0):
"""
:param action: the action
:param action_probability: the probability that the action was given when selecting it
:param all_action_probabilities: the probability that the action was given when selecting it
:param action_value: the state-action value (Q value) of the action
:param state_value: the state value (V value) of the state where the action was taken
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action
@@ -344,7 +344,7 @@ class ActionInfo(object):
selection
"""
self.action = action
self.action_probability = action_probability
self.all_action_probabilities = all_action_probabilities
self.action_value = action_value
self.state_value = state_value
if not max_action_value:

View File

@@ -75,18 +75,27 @@ class EpisodicExperienceReplay(Memory):
def num_transitions_in_complete_episodes(self):
return self._num_transitions_in_complete_episodes
def sample(self, size: int) -> List[Transition]:
def sample(self, size: int, is_consecutive_transitions=False) -> List[Transition]:
"""
Sample a batch of transitions form the replay buffer. If the requested size is larger than the number
Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
of samples available in the replay buffer then the batch will return empty.
:param size: the size of the batch to sample
:param is_consecutive_transitions: if set True, samples a batch of consecutive transitions.
:return: a batch (list) of selected transitions from the replay buffer
"""
self.reader_writer_lock.lock_writing()
if self.num_complete_episodes() >= 1:
transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
batch = [self.transitions[i] for i in transitions_idx]
if is_consecutive_transitions:
episode_idx = np.random.randint(0, self.num_complete_episodes())
if self._buffer[episode_idx].length() <= size:
batch = self._buffer[episode_idx].transitions
else:
transition_idx = np.random.randint(size, self._buffer[episode_idx].length())
batch = self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]
else:
transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
batch = [self.transitions[i] for i in transitions_idx]
else:
raise ValueError("The episodic replay buffer cannot be sampled since there are no complete episodes yet. "
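
For readability, here is a standalone restatement of the new consecutive-sampling branch (an illustrative sketch, not rl_coach code; the function name and the plain-list episode representation are invented for the example):

import numpy as np

def sample_consecutive_window(episode_transitions, size):
    # Mirrors the is_consecutive_transitions branch above: return the whole
    # episode if it is no longer than the requested window, otherwise pick a
    # random end index and return the size transitions that precede it.
    if len(episode_transitions) <= size:
        return list(episode_transitions)
    end = np.random.randint(size, len(episode_transitions))
    return list(episode_transitions[end - size:end])

# e.g. a 20-step window out of a 100-step episode:
# window = sample_consecutive_window(list(range(100)), 20)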

View File

@@ -0,0 +1,45 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)
#########
# Agent #
#########
agent_params = ACERAgentParameters()
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 10000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.beta_entropy = 0.05
###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))
########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
schedule_params=schedule_params, vis_params=VisualizationParameters(),
preset_validation_params=preset_validation_params)

View File

@@ -0,0 +1,49 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)
#########
# Agent #
#########
agent_params = ACERAgentParameters()
agent_params.algorithm.num_steps_between_gradient_updates = 5
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 1000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))
agent_params.algorithm.beta_entropy = 0.0
###############
# Environment #
###############
env_params = GymVectorEnvironment(level='CartPole-v0')
########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 300
preset_validation_params.num_workers = 1
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
schedule_params=schedule_params, vis_params=VisualizationParameters(),
preset_validation_params=preset_validation_params)
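
Usage note (an assumption, since filenames are not shown in this view): if this preset is saved under rl_coach/presets as, say, CartPole_ACER.py, it would typically be launched with Coach's command line as

coach -p CartPole_ACER

and the Atari preset above, which wraps its level in SingleLevelSelection, would additionally need a level flag, e.g. coach -p Atari_ACER -lvl breakout. The preset names here follow Coach's usual naming convention but are not confirmed by the diff.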

View File

@@ -0,0 +1,55 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)
#########
# Agent #
#########
agent_params = ACERAgentParameters()
agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 2000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 100000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/100.))
agent_params.algorithm.beta_entropy = 0.01
agent_params.network_wrappers['main'].clip_gradients = 40.
###############
# Environment #
###############
env_params = DoomEnvironmentParameters(level='basic')
########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400
preset_validation_params.num_workers = 8
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
schedule_params=schedule_params, vis_params=VisualizationParameters(),
preset_validation_params=preset_validation_params)

View File

@@ -403,7 +403,8 @@ class DiscreteActionSpace(ActionSpace):
return np.random.choice(self.actions)
def sample_with_info(self) -> ActionInfo:
return ActionInfo(self.sample(), action_probability=1. / (self.high[0] - self.low[0] + 1))
return ActionInfo(self.sample(),
all_action_probabilities=np.full(len(self.actions), 1. / (self.high[0] - self.low[0] + 1)))
def get_description(self, action: int) -> str:
if type(self.descriptions) == list and 0 <= action < len(self.descriptions):
@@ -450,7 +451,7 @@ class MultiSelectActionSpace(ActionSpace):
return random.choice(self.actions)
def sample_with_info(self) -> ActionInfo:
return ActionInfo(self.sample(), action_probability=1. / len(self.actions))
return ActionInfo(self.sample(), all_action_probabilities=np.full(len(self.actions), 1. / len(self.actions)))
def get_description(self, action: np.ndarray) -> str:
if np.sum(len(np.where(action == 0)[0])) + np.sum(len(np.where(action == 1)[0])) != self.shape or \

View File

@@ -14,7 +14,7 @@ def test_discrete():
for i in range(100):
assert 3 > action_space.sample() >= 0
action_info = action_space.sample_with_info()
assert action_info.action_probability == 1. / 3
assert action_info.all_action_probabilities[0] == 1. / 3
assert action_space.high == 2
assert action_space.low == 0