Mirror of https://github.com/gryf/coach.git
ACER algorithm (#184)
* initial ACER commit
* Code cleanup + several fixes
* Q-retrace bug fix + small clean-ups
* added documentation for ACER
* ACER benchmarks
* update benchmarks table
* Add nightly running of golden and trace tests. (#202) Resolves #200
* comment out nightly trace tests until values reset.
* remove redundant observe ignore (#168)
* ensure nightly test env containers exist. (#205) Also bump integration test timeout
* wxPython removal (#207) Replacing wxPython with Python's Tkinter. Also removing the option to choose multiple files as it is unused and causes errors, and fixing the load file/directory spinner.
* Create CONTRIBUTING.md (#210)
* Create CONTRIBUTING.md. Resolves #188
* run nightly golden tests sequentially. (#217) Should reduce resource requirements and potential CPU contention but increases overall execution time.
* tests: added new setup configuration + test args (#211) - added utils for future tests and conftest - added test args
* new docs build
* golden test update
rl_coach/agents/acer_agent.py (new file, 198 lines)
@@ -0,0 +1,198 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np

from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.head_parameters import ACERPolicyHeadParameters, QHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters
from rl_coach.core_types import Batch
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import eps, last_sample


class ACERAlgorithmParameters(AlgorithmParameters):
    """
    :param num_steps_between_gradient_updates: (int)
        Every num_steps_between_gradient_updates transitions will be considered as a single batch and used for
        accumulating gradients. This is also the number of steps used for bootstrapping according to the n-step
        formulation.

    :param ratio_of_replay: (int)
        The number of off-policy training iterations in each ACER iteration.

    :param num_transitions_to_start_replay: (int)
        Number of environment steps until ACER starts to train off-policy from the experience replay.
        This emulates a heat-up phase where the agent learns only on-policy until there are enough transitions in
        the experience replay to start the off-policy training.

    :param rate_for_copying_weights_to_target: (float)
        The rate of the exponential moving average for the average policy which is used for the trust region
        optimization. The target network in this algorithm is used as the average policy.

    :param importance_weight_truncation: (float)
        The clipping constant for the importance weight truncation (not used in the Q-retrace calculation).

    :param use_trust_region_optimization: (bool)
        If set to True, the gradients of the network will be modified with a term dependent on the KL divergence
        between the average policy and the current one, to bound the change of the policy during the network update.

    :param max_KL_divergence: (float)
        The upper bound parameter for the trust region optimization; use_trust_region_optimization needs to be set
        to True for this parameter to have an effect.

    :param beta_entropy: (float)
        An entropy regularization term can be added to the loss function in order to control exploration. This term
        is weighted using the beta value defined by beta_entropy.
    """
    def __init__(self):
        super().__init__()
        self.apply_gradients_every_x_episodes = 5
        self.num_steps_between_gradient_updates = 5000
        self.ratio_of_replay = 4
        self.num_transitions_to_start_replay = 10000
        self.rate_for_copying_weights_to_target = 0.99
        self.importance_weight_truncation = 10.0
        self.use_trust_region_optimization = True
        self.max_KL_divergence = 1.0
        self.beta_entropy = 0


class ACERNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
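        # two heads on a shared embedder/middleware: a Q head for per-action values and the ACER policy head
        # for the softmax policy; the Q head loss is down-weighted (0.5) relative to the policy head (1.0)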
        self.heads_parameters = [QHeadParameters(loss_weight=0.5), ACERPolicyHeadParameters(loss_weight=1.0)]
        self.optimizer_type = 'Adam'
        self.async_training = True
        self.clip_gradients = 40.0
        self.create_target_network = True


class ACERAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=ACERAlgorithmParameters(),
                         exploration={DiscreteActionSpace: CategoricalParameters()},
                         memory=EpisodicExperienceReplayParameters(),
                         networks={"main": ACERNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.acer_agent:ACERAgent'


# Actor-Critic with Experience Replay - https://arxiv.org/abs/1611.01224
class ACERAgent(PolicyOptimizationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        # signals definition
        self.q_loss = self.register_signal('Q Loss')
        self.policy_loss = self.register_signal('Policy Loss')
        self.probability_loss = self.register_signal('Probability Loss')
        self.bias_correction_loss = self.register_signal('Bias Correction Loss')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.V_Values = self.register_signal('Values')
        self.kl_divergence = self.register_signal('KL Divergence')

    def _learn_from_batch(self, batch):
        fetches = [self.networks['main'].online_network.output_heads[1].probability_loss,
                   self.networks['main'].online_network.output_heads[1].bias_correction_loss,
                   self.networks['main'].online_network.output_heads[1].kl_divergence]

        # batch contains a list of transitions to learn from
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the values for the current states
        Q_values, policy_prob = self.networks['main'].online_network.predict(batch.states(network_keys))
        avg_policy_prob = self.networks['main'].target_network.predict(batch.states(network_keys))[1]
        current_state_values = np.sum(policy_prob * Q_values, axis=1)

        actions = batch.actions()
        num_transitions = batch.size
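        # the Q head is trained towards its own current predictions for all actions except the taken ones,
        # which are overwritten below with the Q-retrace targets. note that this is an alias of Q_values
        # (not a copy), so Q_values is modified in place as the targets are filled in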
        Q_head_targets = Q_values

        Q_i = Q_values[np.arange(num_transitions), actions]

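        # importance sampling ratios between the current policy and the behavior policy mu that generated the
        # transitions: rho = pi(a|s) / mu(a|s), with rho_i taken at the actions that were actually played.
        # rho_bar = min(1, rho_i) is the truncated ratio used in the Q-retrace recursion below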
        mu = batch.info('all_action_probabilities')
        rho = policy_prob / (mu + eps)
        rho_i = rho[np.arange(batch.size), actions]

        rho_bar = np.minimum(1.0, rho_i)

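        # bootstrap the retrace target from the last next-state of the batch:
        # zero if the episode terminated there, otherwise V(s) = sum_a pi(a|s) * Q(s, a)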
        if batch.game_overs()[-1]:
            Qret = 0
        else:
            result = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))
            Qret = np.sum(result[0] * result[1], axis=1)[0]

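        # backward recursion over the batch, following the Q-retrace estimator (Retrace with lambda=1) from the
        # ACER paper referenced above:
        #   Qret(s_t, a_t) = r_t + gamma * [rho_bar_{t+1} * (Qret(s_{t+1}, a_{t+1}) - Q(s_{t+1}, a_{t+1})) + V(s_{t+1})]
        # each step first discounts the running Qret and adds the reward, stores it as the Q head target for the
        # taken action, and then folds in the truncated-importance-weight correction used by the earlier step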
        for i in reversed(range(num_transitions)):
            Qret = batch.rewards()[i] + self.ap.algorithm.discount * Qret
            Q_head_targets[i, actions[i]] = Qret
            Qret = rho_bar[i] * (Qret - Q_i[i]) + current_state_values[i]

        Q_retrace = Q_head_targets[np.arange(num_transitions), actions]

        # train
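        # the extra 'output_1_k' entries feed the placeholders that the ACER policy head (output head index 1)
        # appends to its inputs, in order: the taken actions, the full importance weights rho, the per-action
        # importance weights rho_i, the Q values, the Q-retrace targets, and the average (target network) policy.
        # the targets are the Q head targets and, for the policy head, the state values V(s)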
        result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
                                                                'output_1_0': actions,
                                                                'output_1_1': rho,
                                                                'output_1_2': rho_i,
                                                                'output_1_3': Q_values,
                                                                'output_1_4': Q_retrace,
                                                                'output_1_5': avg_policy_prob},
                                                               [Q_head_targets, current_state_values],
                                                               additional_fetches=fetches)

        for network in self.networks.values():
            network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)

        # logging
        total_loss, losses, unclipped_grads, fetch_result = result[:4]
        self.q_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])
        self.probability_loss.add_sample(fetch_result[0])
        self.bias_correction_loss.add_sample(fetch_result[1])
        self.unclipped_grads.add_sample(unclipped_grads)
        self.V_Values.add_sample(current_state_values)
        self.kl_divergence.add_sample(fetch_result[2])

        return total_loss, losses, unclipped_grads

    def learn_from_batch(self, batch):
        # perform on-policy training iteration
        total_loss, losses, unclipped_grads = self._learn_from_batch(batch)

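        # the number of off-policy iterations per on-policy iteration is drawn from a Poisson distribution whose
        # mean is ratio_of_replay (the replay ratio of the ACER paper); each off-policy batch is a window of
        # consecutive transitions from the episodic replay buffer (the second argument to 'sample' below
        # requests consecutive transitions)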
        if self.ap.algorithm.ratio_of_replay > 0 \
                and self.memory.num_transitions() > self.ap.algorithm.num_transitions_to_start_replay:
            n = np.random.poisson(self.ap.algorithm.ratio_of_replay)
            # perform n off-policy training iterations
            for _ in range(n):
                new_batch = Batch(self.call_memory('sample', (self.ap.algorithm.num_steps_between_gradient_updates, True)))
                result = self._learn_from_batch(new_batch)
                total_loss += result[0]
                losses += result[1]
                unclipped_grads += result[2]

        return total_loss, losses, unclipped_grads

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "main")
        return self.networks['main'].online_network.predict(tf_input_state)[1:]  # index 0 is the state value
@@ -1026,7 +1026,7 @@ class Agent(AgentInterface):
        """
        Collect all of agent's network savers
        :param parent_path_suffix: path suffix of the parent of the agent
-                                  (could be name of level manager or composite agent)
+                                  (could be name of level manager or composite agent)
        :return: collection of all agent savers
        """
        parent_path_suffix = "{}.{}".format(parent_path_suffix, self.name)

@@ -149,7 +149,7 @@ class PolicyOptimizationAgent(Agent):
            action_probabilities = np.array(action_values).squeeze()
            action = self.exploration_policy.get_action(action_probabilities)
            action_info = ActionInfo(action=action,
-                                    action_probability=action_probabilities[action])
+                                    all_action_probabilities=action_probabilities)

            self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
        elif isinstance(self.spaces.action, BoxActionSpace):
@@ -176,3 +176,13 @@ class RainbowQHeadParameters(HeadParameters):
                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                         loss_weight=loss_weight)


+class ACERPolicyHeadParameters(HeadParameters):
+    def __init__(self, activation_function: str ='relu', name: str='acer_policy_head_params',
+                 num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
+                 loss_weight: float = 1.0, dense_layer=None):
+        super().__init__(parameterized_class_name="ACERPolicyHead", activation_function=activation_function, name=name,
+                         dense_layer=dense_layer, num_output_head_copies=num_output_head_copies,
+                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
+                         loss_weight=loss_weight)

@@ -350,7 +350,7 @@ class TensorFlowArchitecture(Architecture):
                importance_weight = np.ones(target_ph.shape[0])
            else:
                importance_weight = importance_weights[placeholder_idx]
-           importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
+           importance_weight = np.reshape(importance_weight, (-1,) + (1,) * (len(target_ph.shape) - 1))

            feed_dict[self.importance_weights[placeholder_idx]] = importance_weight

@@ -11,6 +11,7 @@ from .q_head import QHead
from .quantile_regression_q_head import QuantileRegressionQHead
from .rainbow_q_head import RainbowQHead
from .v_head import VHead
+from .acer_policy_head import ACERPolicyHead

__all__ = [
    'CategoricalQHead',
@@ -25,5 +26,6 @@ __all__ = [
    'QHead',
    'QuantileRegressionQHead',
    'RainbowQHead',
-   'VHead'
+   'VHead',
+   'ACERPolicyHead'
]

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import tensorflow as tf

from rl_coach.architectures.tensorflow_components.layers import Dense
from rl_coach.architectures.tensorflow_components.heads.head import Head
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import ActionProbabilities
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps


class ACERPolicyHead(Head):
    def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
                 head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu',
                 dense_layer=Dense):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function,
                         dense_layer=dense_layer)
        self.name = 'acer_policy_head'
        self.return_type = ActionProbabilities
        self.beta = None
        self.action_penalty = None

        # a scalar weight that penalizes low entropy values to encourage exploration
        if hasattr(agent_parameters.algorithm, 'beta_entropy'):
            # we set the beta value as a tf variable so it can be updated later if needed
            self.beta = tf.Variable(float(agent_parameters.algorithm.beta_entropy),
                                    trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES])
            self.beta_placeholder = tf.placeholder('float')
            self.set_beta = tf.assign(self.beta, self.beta_placeholder)

    def _build_module(self, input_layer):
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # create a discrete action network (softmax probabilities output)
            self._build_discrete_net(input_layer, self.spaces.action)
        else:
            raise ValueError("only discrete action spaces are supported for ACER")

        if self.is_local:
            # add entropy regularization
            if self.beta:
                self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
                self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
                tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)

            # Truncated importance sampling with bias corrections
            importance_sampling_weight = tf.placeholder(tf.float32, [None, self.num_actions],
                                                        name='{}_importance_sampling_weight'.format(self.get_name()))
            self.input.append(importance_sampling_weight)
            importance_sampling_weight_i = tf.placeholder(tf.float32, [None],
                                                          name='{}_importance_sampling_weight_i'.format(self.get_name()))
            self.input.append(importance_sampling_weight_i)

            V_values = tf.placeholder(tf.float32, [None], name='{}_V_values'.format(self.get_name()))
            self.target.append(V_values)
            Q_values = tf.placeholder(tf.float32, [None, self.num_actions], name='{}_Q_values'.format(self.get_name()))
            self.input.append(Q_values)
            Q_retrace = tf.placeholder(tf.float32, [None], name='{}_Q_retrace'.format(self.get_name()))
            self.input.append(Q_retrace)

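            # the policy-gradient loss has two parts (truncated importance sampling with bias correction):
            #   - probability_loss: the on-sample term, -log pi(a_i|s_i) * (Qret_i - V_i), weighted by the
            #     truncated importance weight min(c, rho_i), with c = importance_weight_truncation
            #   - bias_correction_loss: a sum over all actions of -log pi(a|s) * (Q(s, a) - V(s)), weighted by
            #     pi(a|s) * [1 - c / rho(a|s)]_+, which compensates for the bias introduced by the truncation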
            action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
            self.probability_loss = -tf.reduce_mean(action_log_probs_wrt_policy
                                                    * (Q_retrace - V_values)
                                                    * tf.minimum(self.ap.algorithm.importance_weight_truncation,
                                                                 importance_sampling_weight_i))

            log_probs_wrt_policy = tf.log(self.policy_probs + eps)
            bias_correction_gain = tf.reduce_sum(log_probs_wrt_policy
                                                 * (Q_values - tf.expand_dims(V_values, 1))
                                                 * tf.nn.relu(1.0 - (self.ap.algorithm.importance_weight_truncation
                                                                     / (importance_sampling_weight + eps)))
                                                 * tf.stop_gradient(self.policy_probs),
                                                 axis=1)
            self.bias_correction_loss = -tf.reduce_mean(bias_correction_gain)

            self.loss = self.probability_loss + self.bias_correction_loss
            tf.losses.add_loss(self.loss)

            # Trust region
            batch_size = tf.to_float(tf.shape(input_layer)[0])
            average_policy = tf.placeholder(tf.float32, [None, self.num_actions],
                                            name='{}_average_policy'.format(self.get_name()))
            self.input.append(average_policy)
            average_policy_distribution = tf.contrib.distributions.Categorical(probs=(average_policy + eps))
            self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(average_policy_distribution,
                                                                               self.policy_distribution))
            if self.ap.algorithm.use_trust_region_optimization:
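                # trust region layer, following the efficient trust region scheme of the ACER paper: the forward
                # pass is the identity, while the backward pass projects the incoming policy gradient g so that
                # its step along k = grad_pi KL(average_policy || pi) (computed here as -average_policy / pi)
                # does not exceed max_KL_divergence:
                #   g' = g - max(0, (k . g - max_KL_divergence) / ||k||^2) * k
                # the negations and batch_size factors convert between the minimized (mean) loss gradient and the
                # maximized per-sample policy gradient on which the projection is defined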
                @tf.custom_gradient
                def trust_region_layer(x):
                    def grad(g):
                        g = - g * batch_size
                        k = - average_policy / (self.policy_probs + eps)
                        adj = tf.nn.relu(
                            (tf.reduce_sum(k * g, axis=1) - self.ap.algorithm.max_KL_divergence)
                            / (tf.reduce_sum(tf.square(k), axis=1) + eps))
                        g = g - tf.expand_dims(adj, 1) * k
                        return - g / batch_size
                    return tf.identity(x), grad
                self.output = trust_region_layer(self.output)

    def _build_discrete_net(self, input_layer, action_space):
        self.num_actions = len(action_space.actions)
        self.actions = tf.placeholder(tf.int32, [None], name='{}_actions'.format(self.get_name()))
        self.input.append(self.actions)

        policy_values = self.dense_layer(self.num_actions)(input_layer, name='fc')
        self.policy_probs = tf.nn.softmax(policy_values, name='{}_policy'.format(self.get_name()))

        # (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
        self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
        self.output = self.policy_probs
@@ -329,12 +329,12 @@ class ActionInfo(object):
    Action info is a class that holds an action and various additional information details about it
    """

-   def __init__(self, action: ActionType, action_probability: float=0,
+   def __init__(self, action: ActionType, all_action_probabilities: float=0,
                 action_value: float=0., state_value: float=0., max_action_value: float=None,
                 action_intrinsic_reward: float=0):
        """
        :param action: the action
-       :param action_probability: the probability that the action was given when selecting it
+       :param all_action_probabilities: the probabilities of all the actions, as given when the action was selected
        :param action_value: the state-action value (Q value) of the action
        :param state_value: the state value (V value) of the state where the action was taken
        :param max_action_value: in case this is an action that was selected randomly, this is the value of the action
@@ -344,7 +344,7 @@ class ActionInfo(object):
                                 selection
        """
        self.action = action
-       self.action_probability = action_probability
+       self.all_action_probabilities = all_action_probabilities
        self.action_value = action_value
        self.state_value = state_value
        if not max_action_value:

@@ -75,18 +75,27 @@ class EpisodicExperienceReplay(Memory):
    def num_transitions_in_complete_episodes(self):
        return self._num_transitions_in_complete_episodes

-   def sample(self, size: int) -> List[Transition]:
+   def sample(self, size: int, is_consecutive_transitions=False) -> List[Transition]:
        """
-       Sample a batch of transitions form the replay buffer. If the requested size is larger than the number
+       Sample a batch of transitions from the replay buffer. If the requested size is larger than the number
        of samples available in the replay buffer then the batch will return empty.
        :param size: the size of the batch to sample
+       :param is_consecutive_transitions: if set True, samples a batch of consecutive transitions.
        :return: a batch (list) of selected transitions from the replay buffer
        """
        self.reader_writer_lock.lock_writing()

        if self.num_complete_episodes() >= 1:
-           transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
-           batch = [self.transitions[i] for i in transitions_idx]
+           if is_consecutive_transitions:
+               episode_idx = np.random.randint(0, self.num_complete_episodes())
+               if self._buffer[episode_idx].length() <= size:
+                   batch = self._buffer[episode_idx].transitions
+               else:
+                   transition_idx = np.random.randint(size, self._buffer[episode_idx].length())
+                   batch = self._buffer[episode_idx].transitions[transition_idx-size:transition_idx]
+           else:
+               transitions_idx = np.random.randint(self.num_transitions_in_complete_episodes(), size=size)
+               batch = [self.transitions[i] for i in transitions_idx]

        else:
            raise ValueError("The episodic replay buffer cannot be sampled since there are no complete episodes yet. "
rl_coach/presets/Atari_ACER.py (new file, 45 lines)
@@ -0,0 +1,45 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 10000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.beta_entropy = 0.05

###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
rl_coach/presets/CartPole_ACER.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.num_steps_between_gradient_updates = 5
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 1000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))
agent_params.algorithm.beta_entropy = 0.0

###############
# Environment #
###############
env_params = GymVectorEnvironment(level='CartPole-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 300
preset_validation_params.num_workers = 1

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
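For reference, a preset such as CartPole_ACER is driven through the graph_manager it defines. A minimal sketch of running it programmatically (not part of this commit; it assumes the usual rl_coach TaskParameters / GraphManager API, and the experiment path below is only a placeholder):

    from rl_coach.base_parameters import TaskParameters
    from rl_coach.presets.CartPole_ACER import graph_manager

    # build the graph for a single worker and start the improve/evaluate schedule defined above
    graph_manager.create_graph(TaskParameters(experiment_path='./experiments/cartpole_acer'))
    graph_manager.improve()

The same preset can also be launched from the command line with: coach -p CartPole_ACER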
rl_coach/presets/Doom_Basic_ACER.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)


#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 2000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 100000)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/100.))
agent_params.algorithm.beta_entropy = 0.01
agent_params.network_wrappers['main'].clip_gradients = 40.

###############
# Environment #
###############
env_params = DoomEnvironmentParameters(level='basic')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400
preset_validation_params.num_workers = 8


graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
@@ -403,7 +403,8 @@ class DiscreteActionSpace(ActionSpace):
        return np.random.choice(self.actions)

    def sample_with_info(self) -> ActionInfo:
-       return ActionInfo(self.sample(), action_probability=1. / (self.high[0] - self.low[0] + 1))
+       return ActionInfo(self.sample(),
+                         all_action_probabilities=np.full(len(self.actions), 1. / (self.high[0] - self.low[0] + 1)))

    def get_description(self, action: int) -> str:
        if type(self.descriptions) == list and 0 <= action < len(self.descriptions):
@@ -450,7 +451,7 @@ class MultiSelectActionSpace(ActionSpace):
        return random.choice(self.actions)

    def sample_with_info(self) -> ActionInfo:
-       return ActionInfo(self.sample(), action_probability=1. / len(self.actions))
+       return ActionInfo(self.sample(), all_action_probabilities=np.full(len(self.actions), 1. / len(self.actions)))

    def get_description(self, action: np.ndarray) -> str:
        if np.sum(len(np.where(action == 0)[0])) + np.sum(len(np.where(action == 1)[0])) != self.shape or \
@@ -14,7 +14,7 @@ def test_discrete():
    for i in range(100):
        assert 3 > action_space.sample() >= 0
        action_info = action_space.sample_with_info()
-       assert action_info.action_probability == 1. / 3
+       assert action_info.all_action_probabilities[0] == 1. / 3
    assert action_space.high == 2
    assert action_space.low == 0