mirror of https://github.com/gryf/coach.git synced 2025-12-18 03:30:19 +01:00

pre-release 0.10.0

Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions


@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

rl_coach/agents/actor_critic_agent.py

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import scipy.signal
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import QActionStateValue
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import last_sample
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class ActorCriticAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.A_VALUE
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 5000 # this is called t_max in all the papers
self.gae_lambda = 0.96
self.estimate_state_value_using_gae = False
class ActorCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters(), PolicyHeadParameters()]
self.loss_weights = [0.5, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.optimizer_type = 'Adam'
self.clip_gradients = 40.0
self.async_training = True
class ActorCriticAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ActorCriticAlgorithmParameters(),
exploration=None, #TODO this should be different for continuous (ContinuousEntropyExploration)
# and discrete (CategoricalExploration) action spaces.
memory=SingleEpisodeBufferParameters(),
networks={"main": ActorCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.actor_critic_agent:ActorCriticAgent'
# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.action_advantages = self.register_signal('Advantages')
self.state_values = self.register_signal('Values')
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
# Discounting function used to calculate discounted returns.
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def get_general_advantage_estimation_values(self, rewards, values):
# values contain n+1 elements (t ... t+n), rewards contain n elements (t ... t+n-1)
bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])
# Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
# although in practice it works well even for much smaller Tmax values, e.g. 20)
deltas = rewards + self.ap.algorithm.discount * values[1:] - values[:-1]
gae = self.discount(deltas, self.ap.algorithm.discount * self.ap.algorithm.gae_lambda)
if self.ap.algorithm.estimate_state_value_using_gae:
discounted_returns = np.expand_dims(gae + values[:-1], -1)
else:
discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
self.ap.algorithm.discount)), 1)[:-1]
return gae, discounted_returns
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
result = self.networks['main'].online_network.predict(batch.states(network_keys))
current_state_values = result[0]
self.state_values.add_sample(current_state_values)
# the targets for the state value estimator
num_transitions = batch.size
state_value_head_targets = np.zeros((num_transitions, 1))
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if batch.game_overs()[-1]:
R = 0
else:
R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
for i in reversed(range(num_transitions)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
values = np.append(current_state_values, bootstrapped_value)
if batch.game_overs()[-1]:
values[-1] = 0
# get general discounted returns table
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
action_advantages = action_advantages.squeeze(axis=-1)
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
# train
result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
'output_1_0': actions},
[state_value_head_targets, action_advantages])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.action_advantages.add_sample(action_advantages)
self.unclipped_grads.add_sample(unclipped_grads)
self.value_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
return total_loss, losses, unclipped_grads
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)[1:] # index 0 is the state value
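
The discount() helper above computes discounted cumulative sums with scipy.signal.lfilter, and get_general_advantage_estimation_values() turns TD residuals into GAE advantages and value targets. Below is an illustrative, standalone sketch of the same math with toy rewards and values and a hypothetical helper name; it is not part of the committed file.

# Illustrative sketch (not from the coach codebase): discounted returns and GAE
# for a toy 4-step rollout, mirroring discount() and
# get_general_advantage_estimation_values() above.
import numpy as np
import scipy.signal

def discounted_cumsum(x, gamma):
    # y[t] = x[t] + gamma * y[t+1], same lfilter trick as discount()
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

gamma, lam = 0.99, 0.96
rewards = np.array([1.0, 0.0, 0.5, 1.0])        # r_t ... r_{t+n-1}
values = np.array([0.9, 0.8, 0.7, 0.6, 0.0])    # V(s_t) ... V(s_{t+n}); last entry is the bootstrap

deltas = rewards + gamma * values[1:] - values[:-1]            # TD residuals
advantages = discounted_cumsum(deltas, gamma * lam)            # GAE
returns = discounted_cumsum(np.append(rewards, values[-1]), gamma)[:-1]  # discounted n-step returns

print(advantages, returns)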

rl_coach/agents/agent.py Normal file (791 lines added)

@@ -0,0 +1,791 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import random
from collections import OrderedDict
from typing import Dict, List, Union, Tuple
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.core_types import RunPhase, PredictionType, EnvironmentEpisodes, ActionType, Batch, Episode, StateType
from rl_coach.core_types import Transition, ActionInfo, TrainingSteps, EnvironmentSteps, EnvResponse
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
from pandas import read_pickle
from six.moves import range
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace, GoalsSpace, AttentionActionSpace
from rl_coach.utils import Signal, force_list, set_cpu
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.network_wrapper import NetworkWrapper
from rl_coach.logger import screen, Logger, EpisodeLogger
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
:param agent_parameters: A Preset class instance with all the running parameters
"""
super().__init__()
self.ap = agent_parameters
self.task_id = self.ap.task_parameters.task_index
self.is_chief = self.task_id == 0
self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
and self.ap.memory.shared_memory
if self.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
self.name = agent_parameters.name
self.parent = parent
self.parent_level_manager = None
self.full_name_id = agent_parameters.full_name_id = self.name
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
"tensorflow wake up time)".format(self.full_name_id, self.task_id))
else:
screen.log_title("Creating agent - name: {}".format(self.full_name_id))
self.imitation = False
self.agent_logger = Logger()
self.agent_episode_logger = EpisodeLogger()
# get the memory
# - distributed training + shared memory:
# * is chief? -> create the memory and add it to the scratchpad
# * not chief? -> wait for the chief to create the memory and then fetch it
# - non distributed training / not shared memory:
# * create memory
memory_name = self.ap.memory.path.split(':')[1]
self.memory_lookup_name = self.full_name_id + '.' + memory_name
if self.shared_memory and not self.is_chief:
self.memory = self.shared_memory_scratchpad.get(self.memory_lookup_name)
else:
# modules
if agent_parameters.memory.load_memory_from_file_path:
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
.format(agent_parameters.memory.load_memory_from_file_path))
self.memory = read_pickle(agent_parameters.memory.load_memory_from_file_path)
else:
self.memory = dynamic_import_and_instantiate_module_from_params(self.ap.memory)
if self.shared_memory and self.is_chief:
self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)
# set devices
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
self.has_global = True
self.replicated_device = agent_parameters.task_parameters.device
self.worker_device = "/job:worker/task:{}".format(self.task_id)
else:
self.has_global = False
self.replicated_device = None
self.worker_device = ""
if agent_parameters.task_parameters.use_cpu:
self.worker_device += "/cpu:0"
else:
self.worker_device += "/device:GPU:0"
# filters
self.input_filter = self.ap.input_filter
self.output_filter = self.ap.output_filter
self.pre_network_filter = self.ap.pre_network_filter
device = self.replicated_device if self.replicated_device else self.worker_device
self.input_filter.set_device(device)
self.output_filter.set_device(device)
self.pre_network_filter.set_device(device)
# initialize all internal variables
self._phase = RunPhase.HEATUP
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.total_steps_counter = 0
self.running_reward = None
self.training_iteration = 0
self.last_target_network_update_step = 0
self.last_training_phase_step = 0
self.current_episode = self.ap.current_episode = 0
self.curr_state = {}
self.current_hrl_goal = None
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.last_episode_evaluation_ran = 0
self.running_observations = []
self.agent_logger.set_current_time(self.current_episode)
self.exploration_policy = None
self.networks = {}
self.last_action_info = None
self.running_observation_stats = None
self.running_reward_stats = None
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
# TODO: add agents observation rendering for debugging purposes (not the same as the environment rendering)
# environment parameters
self.spaces = None
self.in_action_space = self.ap.algorithm.in_action_space
# signals
self.episode_signals = []
self.step_signals = []
self.loss = self.register_signal('Loss')
self.curr_learning_rate = self.register_signal('Learning Rate')
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.reward = self.register_signal('Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
self.shaped_reward = self.register_signal('Shaped Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
if isinstance(self.in_action_space, GoalsSpace):
self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)
# use seed
if self.ap.task_parameters.seed is not None:
random.seed(self.ap.task_parameters.seed)
np.random.seed(self.ap.task_parameters.seed)
@property
def parent(self):
"""
Get the parent class of the agent
:return: the current parent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if self._parent is not None:
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
def setup_logger(self):
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
level_name=self.parent_level_manager.name,
agent_full_id='.'.join(self.full_name_id.split('/')))
self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
if self.ap.visualization.dump_in_episode_signals:
self.agent_episode_logger.set_logger_filenames(self.ap.task_parameters.experiment_path,
logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
self.input_filter.set_session(sess)
self.output_filter.set_session(sess)
self.pre_network_filter.set_session(sess)
[network.set_session(sess) for network in self.networks.values()]
def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True,
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard
:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
:return: the created signal
"""
signal = Signal(signal_name)
if dump_one_value_per_episode:
self.episode_signals.append(signal)
if dump_one_value_per_step:
self.step_signals.append(signal)
return signal
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the environment spaces definition
:return: None
"""
self.spaces = copy.deepcopy(spaces)
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in self.spaces.state.sub_spaces:
self.spaces.state['measurements'].shape += 1
self.spaces.state['measurements'].measurements_names += ['accumulated_reward']
else:
self.spaces.state['measurements'] = VectorObservationSpace(1, measurements_names=['accumulated_reward'])
for observation_name in self.spaces.state.sub_spaces.keys():
self.spaces.state[observation_name] = \
self.pre_network_filter.get_filtered_observation_space(observation_name,
self.input_filter.get_filtered_observation_space(observation_name,
self.spaces.state[observation_name]))
self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
self.input_filter.get_filtered_reward_space(self.spaces.reward))
self.spaces.action = self.output_filter.get_unfiltered_action_space(self.spaces.action)
if isinstance(self.in_action_space, GoalsSpace):
# TODO: what if the goal type is an embedding / embedding change?
self.spaces.goal = self.in_action_space
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
self.init_environment_dependent_modules()
def create_networks(self) -> Dict[str, NetworkWrapper]:
"""
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.
:return: A dict containing all the networks
"""
networks = {}
for network_name in sorted(self.ap.network_wrappers.keys()):
networks[network_name] = NetworkWrapper(name=network_name,
agent_parameters=self.ap,
has_target=self.ap.network_wrappers[network_name].create_target_network,
has_global=self.has_global,
spaces=self.spaces,
replicated_device=self.replicated_device,
worker_device=self.worker_device)
return networks
def init_environment_dependent_modules(self) -> None:
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space
:return: None
"""
# initialize exploration policy
self.ap.exploration.action_space = self.spaces.action
self.exploration_policy = dynamic_import_and_instantiate_module_from_params(self.ap.exploration)
# create all the networks of the agent
self.networks = self.create_networks()
@property
def phase(self) -> RunPhase:
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
:param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
self._phase = val
self.exploration_policy.change_phase(val)
def reset_evaluation_state(self, val: RunPhase) -> None:
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)
if starting_evaluation:
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Starting evaluation phase".format(self.name))
elif ending_evaluation:
# we write to the next episode, because it could be that the current episode was already written
# to disk and then we won't write it again
self.agent_logger.set_current_time(self.current_episode + 1)
self.agent_logger.create_signal_value(
'Evaluation Reward',
self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
self.agent_logger.create_signal_value(
'Shaped Evaluation Reward',
self.accumulated_shaped_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
self.agent_logger.create_signal_value(
"Success Rate",
success_rate
)
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Finished evaluation phase. Success rate = {}"
.format(self.name, np.round(success_rate, 2)))
def call_memory(self, func, args=()):
"""
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.
:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
"""
if self.shared_memory:
result = self.shared_memory_scratchpad.internal_call(self.memory_lookup_name, func, args)
else:
if type(args) != tuple:
args = (args,)
result = getattr(self.memory, func)(*args)
return result
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Exploration"] = np.round(self.exploration_policy.get_control_param(), 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_step_in_episode_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_episode_logger.set_current_time(self.current_episode_steps_counter)
self.agent_episode_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_episode_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_episode_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_episode_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_episode_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_episode_logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
self.agent_episode_logger.create_signal_value("Shaped Accumulated Reward", self.total_shaped_reward_in_current_episode)
self.agent_episode_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_episode_logger.update_wall_clock_time(self.current_episode_steps_counter)
for signal in self.step_signals:
self.agent_episode_logger.create_signal_value(signal.name, signal.get_last_value())
# dump
self.agent_episode_logger.dump_output_csv()
def update_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_logger.set_current_time(self.current_episode)
self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
self.agent_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_logger.create_signal_value("Epsilon", np.mean(self.exploration_policy.get_control_param()))
self.agent_logger.create_signal_value("Shaped Training Reward", self.total_shaped_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_logger.update_wall_clock_time(self.current_episode)
if self._phase != RunPhase.TEST:
self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
self.agent_logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
# dump
if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
and self.current_episode > 0:
self.agent_logger.dump_output_csv()
def handle_episode_ended(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode_buffer.is_complete = True
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.current_episode += 1
if self.phase != RunPhase.TEST and isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('store_episode', self.current_episode_buffer)
if self.phase == RunPhase.TEST:
self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
self.num_evaluation_episodes_completed += 1
if self.spaces.reward.reward_success_threshold and \
self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
self.num_successes_across_evaluation_episodes += 1
if self.ap.visualization.dump_csv:
self.update_log()
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()
def reset_internal_state(self):
"""
Reset all the episodic parameters
:return: None
"""
for signal in self.episode_signals:
signal.reset()
for signal in self.step_signals:
signal.reset()
self.agent_episode_logger.set_episode_idx(self.current_episode)
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.curr_state = {}
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
if self.exploration_policy:
self.exploration_policy.reset()
self.input_filter.reset()
self.output_filter.reset()
self.pre_network_filter.reset()
if isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('verify_last_episode_is_closed')
for network in self.networks.values():
network.online_network.reset_internal_memory()
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
return 0, [], []
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.
:return: boolean: True if the online weights should be copied to the target.
"""
# update the target network of every network that has a target network
step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
if step_method.__class__ == TrainingSteps:
should_update = (self.training_iteration - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.training_iteration
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.total_steps_counter
else:
raise ValueError("The num_steps_between_copying_online_weights_to_target parameter should be either "
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update
def _should_train(self, wait_for_full_episode=False):
"""
Determine if we should start a training phase according to the number of steps passed since the last training
:return: boolean: True if we should start a training phase
"""
step_method = self.ap.algorithm.num_consecutive_playing_steps
if step_method.__class__ == EnvironmentEpisodes:
should_update = (self.current_episode - self.last_training_phase_step) >= step_method.num_steps
if should_update:
self.last_training_phase_step = self.current_episode
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
if wait_for_full_episode:
should_update = should_update and self.current_episode_steps_counter == 0
if should_update:
self.last_training_phase_step = self.total_steps_counter
else:
raise ValueError("The num_consecutive_playing_steps parameter should be either "
"EnvironmentSteps or Episodes. Instead it is {}".format(step_method.__class__))
return should_update
def train(self):
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.
:return: The total training loss during the training iterations.
"""
loss = 0
if self._should_train():
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
# TODO: this should be network dependent
network_parameters = list(self.ap.network_wrappers.values())[0]
# update counters
self.training_iteration += 1
# sample a batch and train on it
batch = self.call_memory('sample', network_parameters.batch_size)
if self.pre_network_filter is not None:
batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)
# if the batch returned empty then there are not enough samples in the replay buffer -> skip
# training step
if len(batch) > 0:
# train
batch = Batch(batch)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
loss += total_loss
self.unclipped_grads.add_sample(unclipped_grads)
# TODO: the learning rate decay should be done through the network instead of here
# decay learning rate
if network_parameters.learning_rate_decay_rate != 0:
self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
self.networks['main'].online_network.current_learning_rate))
else:
self.curr_learning_rate.add_sample(network_parameters.learning_rate)
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.loss.add_sample(loss)
if self.imitation:
self.log_to_screen()
# run additional commands after the training is done
self.post_training_commands()
return loss
def choose_action(self, curr_state):
"""
Choose an action to act with in the current episode being played. Different behavior might be exhibited when training
or testing.
:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc)
"""
pass
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
network_name: str):
"""
Convert curr_state into the input tensors TensorFlow expects, i.e. if we have several input states, stack all
observations together, measurements together, etc.
"""
# convert to batch so we can run it through the network
states = force_list(states)
batches_dict = {}
for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
if key in states[0].keys():
batches_dict[key] = np.array([np.array(state[key]) for state in states])
return batches_dict
def act(self) -> ActionInfo:
"""
Given the agent's current knowledge, decide on the next action to apply to the environment
:return: an action and a dictionary containing any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
return None
# count steps (only when training or if we are in the evaluation worker)
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# decide on the action
if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
# random action
self.last_action_info = self.spaces.action.sample_with_info()
else:
# informed action
if self.pre_network_filter is not None:
# before choosing an action, first use the pre_network_filter to filter out the current state
curr_state = self.run_pre_network_filter_for_inference(self.curr_state)
else:
curr_state = self.curr_state
self.last_action_info = self.choose_action(curr_state)
filtered_action_info = self.output_filter.filter(self.last_action_info)
return filtered_action_info
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network
:param state: a state dict
:return: a numpy embedding vector
"""
# TODO: this won't work anymore
# TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(state, "main"),
outputs=self.networks['main'].online_network.state_embedding)
return embedding
def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.
:param transition: the transition to update
:return: the updated transition
"""
return transition
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment, distill the observation from it and store it for later use.
The response should be a dictionary containing the performed action, the new observation and measurements,
the reward, a game over flag and any additional information necessary.
:param env_response: result of call from environment.step(action)
:return:
"""
# filter the env_response
filtered_env_response = self.input_filter.filter(env_response)[0]
# inject agent collected statistics, if required
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in filtered_env_response.next_state:
filtered_env_response.next_state['measurements'] = np.append(filtered_env_response.next_state['measurements'],
self.total_shaped_reward_in_current_episode)
else:
filtered_env_response.next_state['measurements'] = np.array([self.total_shaped_reward_in_current_episode])
# if we are in the first step in the episode, then we don't have a next state and a reward and thus no
# transition yet, and therefore we don't need to store anything in the memory.
# also we did not reach the goal yet.
if self.current_episode_steps_counter == 0:
# initialize the current state
self.curr_state = filtered_env_response.next_state
return env_response.game_over
else:
transition = Transition(state=copy.copy(self.curr_state), action=self.last_action_info.action,
reward=filtered_env_response.reward, next_state=filtered_env_response.next_state,
game_over=filtered_env_response.game_over, info=filtered_env_response.info)
# now that we have formed a basic transition - the next state progresses to be the current state
self.curr_state = filtered_env_response.next_state
# make agent specific changes to the transition if needed
transition = self.update_transition_before_adding_to_replay_buffer(transition)
# merge the intrinsic reward in
if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
else:
transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward
# sum up the total shaped reward
self.total_shaped_reward_in_current_episode += transition.reward
self.total_reward_in_current_episode += env_response.reward
self.shaped_reward.add_sample(transition.reward)
self.reward.add_sample(env_response.reward)
# add action info to transition
if type(self.parent).__name__ == 'CompositeAgent':
transition.add_info(self.parent.last_action_info.__dict__)
else:
transition.add_info(self.last_action_info.__dict__)
# create and store the transition
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
# for episodic memories we keep the transitions in a local buffer until the episode is ended.
# for regular memories we insert the transitions directly to the memory
if isinstance(self.memory, EpisodicExperienceReplay):
self.current_episode_buffer.insert(transition)
else:
self.call_memory('store', transition)
if self.ap.visualization.dump_in_episode_signals:
self.update_step_in_episode_log()
return transition.game_over
def post_training_commands(self):
pass
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
"""
Get a prediction from the agent with regard to the requested prediction_type.
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueError.
:param states:
:param prediction_type:
:return:
"""
predictions = self.networks['main'].online_network.predict_with_prediction_type(
# states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
states=states, prediction_type=prediction_type)
if len(predictions.keys()) != 1:
raise ValueError("The network has more than one component {} matching the requested prediction_type {}. ".
format(list(predictions.keys()), prediction_type))
return list(predictions.values())[0]
def set_incoming_directive(self, action: ActionType) -> None:
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
self.input_filter.observation_filters['attention'].crop_low = action[0]
self.input_filter.observation_filters['attention'].crop_high = action[1]
self.output_filter.action_filters['masking'].set_masking(action[0], action[1])
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.
:param checkpoint_id: the id of the checkpoint
:return: None
"""
pass
def sync(self) -> None:
"""
Sync the global network parameters to local networks
:return: None
"""
for network in self.networks.values():
network.sync()
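
prepare_batch_for_inference() above stacks a list of per-step state dicts into one batched array per input embedder. Below is a toy sketch of that stacking with hypothetical state keys and values; it is not part of the committed file.

# Illustrative sketch (not from the coach codebase): stacking per-step state dicts
# into one batch array per input, as prepare_batch_for_inference() does.
import numpy as np

states = [
    {'observation': np.zeros(4), 'measurements': np.array([0.1])},
    {'observation': np.ones(4),  'measurements': np.array([0.2])},
]
input_keys = ['observation', 'measurements']   # analogous to input_embedders_parameters.keys()

batch = {key: np.array([np.array(state[key]) for state in states])
         for key in input_keys if key in states[0]}

print(batch['observation'].shape, batch['measurements'].shape)   # (2, 4) (2, 1)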

rl_coach/agents/agent_interface.py

@@ -0,0 +1,125 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List, Dict
import numpy as np
from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType
class AgentInterface(object):
def __init__(self):
self._phase = RunPhase.HEATUP
self._parent = None
self.spaces = None
@property
def parent(self):
"""
Get the parent class of the agent
:return: the current parent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent
:param val: the new parent
:return: None
"""
self._parent = val
@property
def phase(self) -> RunPhase:
"""
Get the phase of the agent
:return: the current phase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase):
"""
Change the phase of the agent
:param val: the new phase
:return: None
"""
self._phase = val
def reset_internal_state(self) -> None:
"""
Reset the episode parameters for the agent
:return: None
"""
raise NotImplementedError("")
def train(self) -> Union[float, List]:
"""
Train the agent's network
:return: The loss of the training
"""
raise NotImplementedError("")
def act(self) -> ActionInfo:
"""
Get a decision of the next action to take.
The action is dependent on the current state which the agent holds from resetting the environment or from
the observe function.
:return: A tuple containing the actual action and additional info on the action
"""
raise NotImplementedError("")
def observe(self, env_response: EnvResponse) -> bool:
"""
Gets a response from the environment.
Processes this information for later use. For example, create a transition and store it in memory.
The action info (a class containing any info the agent wants to store regarding its action decision process) is
stored by the agent itself when deciding on the action.
:param env_response: a EnvResponse containing the response from the environment
:return: a done signal which is based on the agent's knowledge. This can be different from the done signal from
the environment. For example, an agent can decide to finish the episode each time it gets some
intrinsic reward
"""
raise NotImplementedError("")
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
:param checkpoint_id: the checkpoint id to use for saving
:return: None
"""
raise NotImplementedError("")
def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
"""
Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
type of prediction_type, or if there is more than one possible way to do so, raise a ValueError.
:param states:
:param prediction_type:
:return: the agent's prediction
"""
raise NotImplementedError("")
def set_incoming_directive(self, action: ActionType) -> None:
"""
Pass a higher level command (directive) to the agent.
For example, a higher level agent can set the goal of the agent.
:param action: the directive to pass to the agent
:return: None
"""
raise NotImplementedError("")
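
AgentInterface is the contract a concrete agent fills in (reset_internal_state, train, act, observe, and the phase/parent properties). Below is a minimal, hypothetical subclass sketch showing the expected shape of an implementation, assuming ActionInfo accepts the chosen action as its first argument; it is not part of the committed file, and the real Agent class above is far more involved.

# Illustrative sketch (not from the coach codebase): a toy agent that fills in the
# AgentInterface contract with trivial behavior.
import random
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.core_types import ActionInfo

class RandomDiscreteAgent(AgentInterface):
    def __init__(self, num_actions):
        super().__init__()
        self.num_actions = num_actions

    def reset_internal_state(self):
        pass          # nothing episodic to reset

    def train(self):
        return 0.0    # nothing to learn

    def act(self):
        # assumption: ActionInfo takes the chosen action as its first argument
        return ActionInfo(random.randrange(self.num_actions))

    def observe(self, env_response):
        # simply propagate the environment's done signal
        return env_response.game_over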

rl_coach/agents/bc_agent.py

@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.imitation_agent import ImitationAgent
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.base_parameters import AgentParameters, AlgorithmParameters, NetworkParameters, InputEmbedderParameters, \
MiddlewareScheme
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False
class BCNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class BCAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=BCAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.bc_agent:BCAgent'
# Behavioral Cloning Agent
class BCAgent(ImitationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# When using a policy head, the targets refer to the advantages that we are normally feeding the head with.
# In this case, we need the policy head to just predict probabilities, so while we usually train the network
# with log(Pi)*Advantages, in this specific case we will train it with log(Pi) alone, which after the softmax will
# predict Pi (=probabilities)
targets = np.ones(batch.actions().shape[0])
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions()},
targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

rl_coach/agents/bootstrapped_dqn_agent.py

@@ -0,0 +1,84 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.exploration_policies.bootstrapped import BootstrappedParameters
class BootstrappedDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.num_output_head_copies = 10
self.rescale_gradient_from_head_by_factor = [1.0/self.num_output_head_copies]*self.num_output_head_copies
class BootstrappedDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.network_wrappers = {"main": BootstrappedDQNNetworkParameters()}
self.exploration = BootstrappedParameters()
@property
def path(self):
return 'rl_coach.agents.bootstrapped_dqn_agent:BootstrappedDQNAgent'
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
class BootstrappedDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def reset_internal_state(self):
super().reset_internal_state()
self.exploration_policy.select_head()
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
next_states_online_values = self.networks['main'].online_network.predict(batch.next_states(network_keys))
result = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
q_st_plus_1 = result[:self.ap.exploration.architecture_num_q_heads]
TD_targets = result[self.ap.exploration.architecture_num_q_heads:]
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
for i in range(self.ap.network_wrappers['main'].batch_size):
mask = batch[i].info['mask']
for head_idx in range(self.ap.exploration.architecture_num_q_heads):
if mask[head_idx] == 1:
selected_action = np.argmax(next_states_online_values[head_idx][i], 0)
TD_targets[head_idx][i, batch.actions()[i]] = \
batch.rewards()[i] + (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount \
* q_st_plus_1[head_idx][i][selected_action]
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def observe(self, env_response):
mask = np.random.binomial(1, self.ap.exploration.bootstrapped_data_sharing_probability,
self.ap.exploration.architecture_num_q_heads)
env_response.info['mask'] = mask
return super().observe(env_response)
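
observe() above attaches a Bernoulli mask to each transition that decides which Q-heads may train on it, and learn_from_batch() applies that mask when building per-head TD targets. Below is a toy numpy sketch of the masked, per-head update for a single transition with random placeholder values; it is not part of the committed file.

# Illustrative sketch (not from the coach codebase): masked per-head TD targets
# for a single transition, mirroring the loop in learn_from_batch() above.
import numpy as np

num_heads, num_actions = 3, 4
discount, reward, game_over, action = 0.99, 1.0, 0.0, 2

mask = np.random.binomial(1, 0.8, num_heads)           # which heads see this transition
q_next_online = np.random.rand(num_heads, num_actions) # online network, next state, per head
q_next_target = np.random.rand(num_heads, num_actions) # target network, next state, per head
td_targets = np.random.rand(num_heads, num_actions)    # start from the current predictions

for head_idx in range(num_heads):
    if mask[head_idx] == 1:
        selected = np.argmax(q_next_online[head_idx])  # Double-DQN style action selection
        td_targets[head_idx, action] = reward + (1.0 - game_over) * discount * \
            q_next_target[head_idx, selected]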

rl_coach/agents/categorical_dqn_agent.py

@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.categorical_q_head import CategoricalQHeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import StateType
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class CategoricalDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [CategoricalQHeadParameters()]
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.v_min = -10.0
self.v_max = 10.0
self.atoms = 51
class CategoricalDQNExplorationParameters(EGreedyParameters):
def __init__(self):
super().__init__()
self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.evaluation_epsilon = 0.001
class CategoricalDQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
exploration=CategoricalDQNExplorationParameters(),
memory=ExperienceReplayParameters(),
networks={"main": CategoricalDQNNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class CategoricalDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)
def distribution_prediction_to_q_values(self, prediction):
return np.dot(prediction, self.z_values)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
prediction = self.get_prediction(states)
q_values = self.distribution_prediction_to_q_values(prediction)
else:
q_values = None
return q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is calculated by the atoms distribution
# for all other actions, the error is 0
distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)
m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
batches = np.arange(self.ap.network_wrappers['main'].batch_size)
for j in range(self.z_values.size):
tzj = np.fmax(np.fmin(batch.rewards() +
(1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],
self.z_values[self.z_values.size - 1]),
self.z_values[0])
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
u = (np.ceil(bj)).astype(int)
l = (np.floor(bj)).astype(int)
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
# total_loss = cross entropy between actual result above and predicted result for the given action
TD_targets[batches, batch.actions()] = m
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
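
learn_from_batch() above performs the C51 projection: the target distribution at r + gamma*z is projected back onto the fixed support z before being used as the cross-entropy target. Below is a single-transition numpy sketch of that projection with toy values; it is not part of the committed file. The vectorized form needs np.add.at, and an extra line handles the case where a projected atom lands exactly on the support.

# Illustrative sketch (not from the coach codebase): projecting r + gamma * z onto the
# fixed support for a single transition, as done per-atom in learn_from_batch() above.
import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51
z = np.linspace(v_min, v_max, atoms)                  # the support, like self.z_values
dz = z[1] - z[0]

gamma, reward, game_over = 0.99, 1.0, 0.0
p_next = np.random.dirichlet(np.ones(atoms))          # target distribution for the greedy action

m = np.zeros(atoms)
tz = np.clip(reward + (1.0 - game_over) * gamma * z, v_min, v_max)
b = (tz - v_min) / dz
l, u = np.floor(b).astype(int), np.ceil(b).astype(int)

np.add.at(m, l, p_next * (u - b))                     # np.add.at accumulates over repeated indices
np.add.at(m, u, p_next * (b - l))
np.add.at(m, l, p_next * (l == u))                    # keep the mass when b lands exactly on an atom

assert np.isclose(m.sum(), 1.0)                       # m is the projected target distribution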

rl_coach/agents/clipped_ppo_agent.py

@@ -0,0 +1,277 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from random import shuffle
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import EnvironmentSteps, Batch, EnvResponse, StateType
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class ClippedPPONetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
self.loss_weights = [1.0, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.batch_size = 64
self.optimizer_type = 'Adam'
self.clip_gradients = None
self.use_separate_networks_per_head = True
self.async_training = False
self.l2_regularization = 0
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.95
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
self.normalization_stats = None
self.clipping_decay_schedule = ConstantSchedule(1)
class ClippedPPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": ClippedPPONetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.clipped_ppo_agent:ClippedPPOAgent'
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
class ClippedPPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.value_targets = self.register_signal('Value Targets')
self.kl_divergence = self.register_signal('KL Divergence')
self.likelihood_ratio = self.register_signal('Likelihood Ratio')
self.clipped_likelihood_ratio = self.register_signal('Clipped Likelihood Ratio')
def set_session(self, sess):
super().set_session(sess)
if self.ap.algorithm.normalization_stats is not None:
self.ap.algorithm.normalization_stats.set_session(sess)
def fill_advantages(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
current_state_values = self.networks['main'].online_network.predict(batch.states(network_keys))[0]
current_state_values = current_state_values.squeeze()
self.state_values.add_sample(current_state_values)
# calculate advantages
advantages = []
value_targets = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
value_targets = np.array([])
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
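# generalized advantage estimation: delta_t = r_t + discount * V(s_{t+1}) - V(s_t), and
# A_t = sum_l (discount * gae_lambda)^l * delta_{t+l}; the terminal transition is bootstrapped with a value of 0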
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, gae_based_value_targets = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
value_targets = np.append(value_targets, gae_based_value_targets)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
for transition, advantage, value_target in zip(batch.transitions, advantages, value_targets):
transition.info['advantage'] = advantage
transition.info['gae_based_value_target'] = value_target
self.action_advantages.add_sample(advantages)
def train_network(self, batch, epochs):
batch_results = []
for j in range(epochs):
batch.shuffle()
batch_results = {
'total_loss': [],
'losses': [],
'unclipped_grads': [],
'kl_divergence': [],
'entropy': []
}
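# statistics fetched from the PPO head on every mini-batch, used below for logging and signal tracking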
fetches = [self.networks['main'].online_network.output_heads[1].kl_divergence,
self.networks['main'].online_network.output_heads[1].entropy,
self.networks['main'].online_network.output_heads[1].likelihood_ratio,
self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
start = i * self.ap.network_wrappers['main'].batch_size
end = (i + 1) * self.ap.network_wrappers['main'].batch_size
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
actions = batch.actions()[start:end]
gae_based_value_targets = batch.info('gae_based_value_target')[start:end]
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
# TODO-perf - the target network ("old_policy") does not change during training, so this could be calculated once for all epochs;
# the shuffling would then only need to be performed on the indices.
result = self.networks['main'].target_network.predict({k: v[start:end] for k, v in batch.states(network_keys).items()})
old_policy_distribution = result[1:]
# calculate gradients and apply on both the local policy network and on the global policy network
if self.ap.algorithm.estimate_state_value_using_gae:
value_targets = np.expand_dims(gae_based_value_targets, -1)
else:
value_targets = batch.total_returns(expand_dims=True)[start:end]
inputs = copy.copy({k: v[start:end] for k, v in batch.states(network_keys).items()})
inputs['output_1_0'] = actions
# The old_policy_distribution needs to be represented as a list: for discrete controls it contains
# only a mean, whereas for continuous controls it contains both a mean and a standard deviation
for input_index, input in enumerate(old_policy_distribution):
inputs['output_1_{}'.format(input_index + 1)] = input
inputs['output_1_3'] = self.ap.algorithm.clipping_decay_schedule.current_value
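# these feed the PPO head's clipped surrogate objective, min(r * A, clip(r, 1 - eps, 1 + eps) * A), where r is
# the likelihood ratio between the new and old policies; the decay value presumably scales the clipping range eps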
total_loss, losses, unclipped_grads, fetch_result = \
self.networks['main'].train_and_sync_networks(
inputs, [value_targets, batch.info('advantage')[start:end]], additional_fetches=fetches
)
batch_results['total_loss'].append(total_loss)
batch_results['losses'].append(losses)
batch_results['unclipped_grads'].append(unclipped_grads)
batch_results['kl_divergence'].append(fetch_result[0])
batch_results['entropy'].append(fetch_result[1])
self.unclipped_grads.add_sample(unclipped_grads)
self.value_targets.add_sample(value_targets)
self.likelihood_ratio.add_sample(fetch_result[2])
self.clipped_likelihood_ratio.add_sample(fetch_result[3])
for key in batch_results.keys():
batch_results[key] = np.mean(batch_results[key], 0)
self.value_loss.add_sample(batch_results['losses'][0])
self.policy_loss.add_sample(batch_results['losses'][1])
if self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['main'].online_network.get_variable_value(
self.networks['main'].online_network.adaptive_learning_rate_scheme)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['main'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", batch_results['losses'][1]),
("KL divergence", batch_results['kl_divergence']),
("Entropy", batch_results['entropy']),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = batch_results['kl_divergence']
self.entropy.add_sample(batch_results['entropy'])
self.kl_divergence.add_sample(batch_results['kl_divergence'])
return batch_results['losses']
def post_training_commands(self):
# clean memory
self.call_memory('clean')
def train(self):
if self._should_train(wait_for_full_episode=True):
dataset = self.memory.transitions
dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
batch = Batch(dataset)
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['main'].sync()
self.fill_advantages(batch)
# take only the requested number of steps
if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
shuffle(dataset)
batch = Batch(dataset)
self.train_network(batch, self.ap.algorithm.optimization_epochs)
self.post_training_commands()
self.training_iteration += 1
# self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return None
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state
def choose_action(self, curr_state):
self.ap.algorithm.clipping_decay_schedule.step()
return super().choose_action(curr_state)

View File

@@ -0,0 +1,415 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import itertools
from enum import Enum
from typing import Union, List, Dict
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, VisualizationParameters
# from rl_coach.environments.environment_interface import ActionSpace
from rl_coach.spaces import ActionSpace
from rl_coach.spaces import AgentSelection, AttentionActionSpace, ObservationSpace, SpacesDefinition
from rl_coach.utils import short_dynamic_import
from rl_coach.core_types import ActionInfo, EnvResponse, ActionType, RunPhase
from rl_coach.filters.observation.observation_crop_filter import ObservationCropFilter
class DecisionPolicy(object):
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a list of actions from multiple agents, decide on a single action to take.
:param actions_info: a dictionary of agent names and their corresponding
ActionInfo instances containing information for each agent's action
:return: a single action and the corresponding action info
"""
raise NotImplementedError("")
class SingleDecider(DecisionPolicy):
"""
A decision policy that chooses the action according to the agent that is currently in control.
"""
def __init__(self, default_decision_maker: str):
super().__init__()
self._decision_maker = default_decision_maker
@property
def decision_maker(self):
"""
Get the decision maker that was set by the upper level control.
"""
return self._decision_maker
@decision_maker.setter
def decision_maker(self, decision_maker: str):
"""
Set the decision maker by the upper level control.
:param decision_maker: the name of the decision making agent, as set by the upper level control.
"""
self._decision_maker = decision_maker
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a list of actions from multiple agents, take the action of the current decision maker
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
if self.decision_maker not in actions_info.keys():
raise ValueError("The current decision maker ({}) does not exist in the given actions ({})"
.format(self.decision_maker, actions_info.keys()))
return actions_info[self.decision_maker]
class RoundRobin(DecisionPolicy):
"""
A decision policy that chooses the action according to agents selected in a circular order.
"""
def __init__(self, num_agents: int):
super().__init__()
self.round_robin = itertools.cycle(range(num_agents))
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a list of actions from multiple agents, take the action of the current decision maker, which is set in a
circular order
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
decision_maker = next(self.round_robin)
if decision_maker not in range(len(actions_info.keys())):
raise ValueError("The size of action_info does not match the number of agents set to RoundRobin decision"
" policy.")
return list(actions_info.values())[decision_maker]
class MajorityVote(DecisionPolicy):
"""
A decision policy that chooses the action that most of the agents chose.
This policy is only useful for discrete control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a list of actions from multiple agents, take the action that most agents agree on
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce discrete action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
vote_count = np.bincount([action_info.action for action_info in actions_info.values()])
majority_vote = np.argmax(vote_count)
# return the action info of the first agent that voted for the majority action
return next(action_info for action_info in actions_info.values() if action_info.action == majority_vote)
class MeanDecision(DecisionPolicy):
"""
A decision policy that takes the mean action given the actions of all the agents.
This policy is only useful for continuous control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a list of actions from multiple agents, take the mean action
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce continuous action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
mean = np.mean([action_info.action for action_info in actions_info.values()], axis=0)
return ActionInfo(mean)
class RewardPolicy(Enum):
ReachingGoal = 0
NativeEnvironmentReward = 1
AccumulatedEnvironmentRewards = 2
class CompositeAgent(AgentInterface):
"""
A CompositeAgent is a group of agents in the same hierarchy level.
In a CompositeAgent, each agent may take the role of either a controller or an observer.
Each agent that is defined as an observer gets observations from the environment.
Each agent that is defined as a controller can potentially also control the environment, in addition to observing it.
There are several ways to decide on the action from different controller agents:
1. Ensemble -
- Take the majority vote (discrete controls)
- Take the mean action (continuous controls)
- Round robin between the agents (discrete/continuous)
2. Skills -
- At each step a single agent decides (chosen by the upper hierarchy controlling agent)
A CompositeAgent can be controlled using one of the following methods (ActionSpaces):
1. Goals (in terms of measurements, observation, embedding or a change in those values)
2. Agent Selection (skills) / Discrete action space.
3. Attention (a subset of the real environment observation / action space)
"""
def __init__(self,
agents_parameters: Union[AgentParameters, Dict[str, AgentParameters]],
visualization_parameters: VisualizationParameters,
decision_policy: DecisionPolicy,
out_action_space: ActionSpace,
in_action_space: Union[None, ActionSpace]=None,
decision_makers: Union[bool, Dict[str, bool]]=True,
reward_policy: RewardPolicy=RewardPolicy.NativeEnvironmentReward,
name="CompositeAgent"):
"""
Construct an agent group
:param agents_parameters: the parameters of each one of the agents in the group (a single AgentParameters or a dictionary of them)
:param decision_policy: the decision policy of the group which describes how actions are consolidated
:param out_action_space: the type of action space that is used by this composite agent in order to control the
underlying environment
:param in_action_space: the type of action space that is used by the upper level agent in order to control this
group
:param decision_makers: a dictionary mapping each agent name to a boolean indicating whether it has a decision
privilege or if it is just an observer
:param reward_policy: the type of the reward that the group receives
"""
super().__init__()
if isinstance(agents_parameters, AgentParameters):
decision_makers = {agents_parameters.name: True}
agents_parameters = {agents_parameters.name: agents_parameters}
self.agents_parameters = agents_parameters
self.visualization_parameters = visualization_parameters
self.decision_makers = decision_makers
self.decision_policy = decision_policy
self.in_action_space = in_action_space
self.out_action_space = out_action_space # TODO: this is not being used
self.reward_policy = reward_policy
self.full_name_id = self.name = name
self.current_decision_maker = 0
self.environment = None
self.agents = {} # key = agent_name, value = agent
self.incoming_action = None
self.last_state = None
self._phase = RunPhase.HEATUP
self.last_action_info = None
self.current_episode = 0
self.parent_level_manager = None
# environment spaces
self.spaces = None
# counters for logging
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# validate input
if set(self.decision_makers) != set(self.agents_parameters):
raise ValueError("The decision_makers dictionary keys does not match the names of the given agents")
if sum(self.decision_makers.values()) > 1 and type(self.decision_policy) == SingleDecider \
and type(self.in_action_space) != AgentSelection:
raise ValueError("When the control policy is set to single decider, the master policy should control the"
"agent group via agent selection (ControlType.AgentSelection)")
@property
def parent(self):
"""
Get the parent of the composite agent
:return: the parent object
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent of the composite agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of a composite agent must have a name")
self.full_name_id = "{}/{}".format(self._parent.name, self.name)
def create_agents(self):
for agent_name, agent_parameters in self.agents_parameters.items():
agent_parameters.name = agent_name
# create agent
self.agents[agent_parameters.name] = short_dynamic_import(agent_parameters.path)(agent_parameters,
parent=self)
self.agents[agent_parameters.name].parent_level_manager = self.parent_level_manager
# TODO: this is a bit too specific to be defined here
# add an attention cropping filter if the incoming directives are attention boxes
if isinstance(self.in_action_space, AttentionActionSpace):
attention_size = self.in_action_space.forced_attention_size
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'] = \
ObservationCropFilter(crop_low=np.zeros_like(attention_size), crop_high=attention_size)
agent.input_filter.observation_filters.move_to_end('attention', last=False) # add the cropping at the beginning
def setup_logger(self) -> None:
"""
Setup the logger for all the agents in the composite agent
:return: None
"""
[agent.setup_logger() for agent in self.agents.values()]
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
[agent.set_session(sess) for agent in self.agents.values()]
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the definitions of all the spaces of the environment
:return: None
"""
self.spaces = copy.deepcopy(spaces)
[agent.set_environment_parameters(self.spaces) for agent in self.agents.values()]
@property
def phase(self):
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the current phase of all the agents in the group
:param val: the new phase
:return: None
"""
self._phase = val
for agent in self.agents.values():
agent.phase = val
def end_episode(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode += 1
[agent.handle_episode_ended() for agent in self.agents.values()]
def reset_internal_state(self) -> None:
"""
Reset the episode for all the agents in the group
:return: None
"""
# update counters
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# reset all sub modules
[agent.reset_internal_state() for agent in self.agents.values()]
def train(self) -> Union[float, List]:
"""
Make a single training step for all the agents of the group
:return: a list of loss values from the training step
"""
return [agent.train() for agent in self.agents.values()]
def act(self) -> ActionInfo:
"""
Get the actions from all the agents in the group. Then use the decision policy in order to
extract a single action out of the list of actions.
:return: the chosen action and its corresponding information
"""
# update counters
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# get the actions info from all the agents
actions_info = {}
for agent_name, agent in self.agents.items():
action_info = agent.act()
actions_info[agent_name] = action_info
# decide on a single action to apply to the environment
action_info = self.decision_policy.choose_action(actions_info)
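# e.g. a SingleDecider returns the action of the currently selected skill, while MajorityVote / MeanDecision
# ensemble the actions of all the decision makers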
# TODO: make the last action info a property?
# pass the action info to all the observers
for agent_name, is_decision_maker in self.decision_makers.items():
if not is_decision_maker:
self.agents[agent_name].last_action_info = action_info
self.last_action_info = action_info
return self.last_action_info
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment (env_response), filter it and pass it to the agents.
This method has two main jobs:
1. Wrap the previous transition, ending with the new observation coming from EnvResponse.
2. Save the next_state as the current_state to take action upon for the next call to act().
:param env_response: the response received from the environment
:return: whether the episode should end
"""
# accumulate the unfiltered rewards for visualization
self.total_reward_in_current_episode += env_response.reward
episode_ended = env_response.game_over
# pass the env_response to all the sub-agents
# TODO: what if one agent decides to end the episode but the others don't? who decides?
for agent_name, agent in self.agents.items():
goal_reached = agent.observe(env_response)
episode_ended = episode_ended or goal_reached
# TODO: unlike for a single agent, here we also treat a game over by the environment.
# probably better to only return the agents' goal_reached decisions.
return episode_ended
def save_checkpoint(self, checkpoint_id: int) -> None:
[agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]
def set_incoming_directive(self, action: ActionType) -> None:
self.incoming_action = action
if isinstance(self.decision_policy, SingleDecider) and isinstance(self.in_action_space, AgentSelection):
self.decision_policy.decision_maker = list(self.agents.keys())[action]
if isinstance(self.in_action_space, AttentionActionSpace):
# TODO: redesign to be more modular
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'].crop_low = action[0]
agent.input_filter.observation_filters['attention'].crop_high = action[1]
agent.output_filter.action_filters['masking'].set_masking(action[0], action[1])
# TODO rethink this scheme. we don't want so many if else clauses lying around here.
# TODO - for incoming actions which do not involve setting the acting agent we should change the
# observation_space, goal to pursue, etc accordingly to the incoming action.
def sync(self) -> None:
"""
Sync the agent networks with the global network
:return:
"""
[agent.sync() for agent in self.agents.values()]

View File

@@ -0,0 +1,192 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters, EmbedderScheme
from rl_coach.exploration_policies.ou_process import OUProcessParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace, GoalsSpace
from rl_coach.architectures.tensorflow_components.heads.ddpg_actor_head import DDPGActorHeadParameters
from rl_coach.core_types import ActionInfo, EnvironmentSteps
class DDPGCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True),
'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True)}
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
self.heads_parameters = [DDPGActorHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.0001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
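# with target copying every step, this acts as the soft-update coefficient tau: target <- tau * online + (1 - tau) * target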
self.num_consecutive_playing_steps = EnvironmentSteps(1)
self.use_target_network_for_evaluation = False
self.action_penalty = 0
self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
self.use_non_zero_discount_for_terminal_states = False
class DDPGAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DDPGAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"actor": DDPGActorNetworkParameters(),
"critic": DDPGCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ddpg_agent:DDPGAgent'
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.TD_targets_signal = self.register_signal("TD targets")
self.action_signal = self.register_signal("actions")
def learn_from_batch(self, batch):
actor = self.networks['actor']
critic = self.networks['critic']
actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# critic TD target = r + discount * Q_target(s', mu_target(s'))  (DDPG uses the target actor's action rather than a max over actions)
next_actions, actions_mean = actor.parallel_prediction([
(actor.target_network, batch.next_states(actor_keys)),
(actor.online_network, batch.states(actor_keys))
])
critic_inputs = copy.copy(batch.next_states(critic_keys))
critic_inputs['action'] = next_actions
q_st_plus_1 = critic.target_network.predict(critic_inputs)
# calculate the bootstrapped TD targets while discounting terminal states according to
# use_non_zero_discount_for_terminal_states
if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
else:
TD_targets = batch.rewards(expand_dims=True) + \
(1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1
# clip the TD targets to prevent overestimation errors
if self.ap.algorithm.clip_critic_targets:
TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)
self.TD_targets_signal.add_sample(TD_targets)
# get the gradients of the critic output with respect to the action
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = actions_mean
action_gradients = critic.online_network.predict(critic_inputs,
outputs=critic.online_network.gradients_wrt_inputs[0]['action'])
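# deterministic policy gradient: the critic's gradient w.r.t. the action, evaluated at a = mu(s), is chained
# with the actor's gradient d(mu(s))/d(theta) below to update the actor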
# train the critic
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
result = critic.train_and_sync_networks(critic_inputs, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
# apply the gradients from the critic to the actor
initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
gradients = actor.online_network.predict(batch.states(actor_keys),
outputs=actor.online_network.weighted_gradients[0],
initial_feed_dict=initial_feed_dict)
if actor.has_global:
actor.apply_gradients_to_global_network(gradients)
actor.update_online_network()
else:
actor.apply_gradients_to_online_network(gradients)
return total_loss, losses, unclipped_grads
def train(self):
return Agent.train(self)
def choose_action(self, curr_state):
if not (isinstance(self.spaces.action, BoxActionSpace) or isinstance(self.spaces.action, GoalsSpace)):
raise ValueError("DDPG works only for continuous control problems")
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
if self.ap.algorithm.use_target_network_for_evaluation:
actor_network = self.networks['actor'].target_network
else:
actor_network = self.networks['actor'].online_network
action_values = actor_network.predict(tf_input_state).squeeze()
action = self.exploration_policy.get_action(action_values)
self.action_signal.add_sample(action)
# get q value
tf_input_state = self.prepare_batch_for_inference(curr_state, 'critic')
action_batch = np.expand_dims(action, 0)
if type(action) != np.ndarray:
action_batch = np.array([[action]])
tf_input_state['action'] = action_batch
q_value = self.networks['critic'].online_network.predict(tf_input_state)[0]
self.q_values.add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=q_value)
return action_info

View File

@@ -0,0 +1,69 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import EnvironmentSteps
class DDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.ddqn_agent:DDQNAgent'
# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
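# Double DQN: the online network selects the greedy next action, while the target network evaluates its value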
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,219 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
from rl_coach.architectures.tensorflow_components.heads.measurements_prediction_head import MeasurementsPredictionHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class HandlingTargetsAfterEpisodeEnd(Enum):
LastStep = 0
NAN = 1
class DFPNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
'goal': InputEmbedderParameters(activation_function='leaky_relu')}
self.input_embedders_parameters['observation'].scheme = [
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1]),
Dense([512]),
]
self.input_embedders_parameters['measurements'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.input_embedders_parameters['goal'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
scheme=MiddlewareScheme.Empty)
self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
self.loss_weights = [1.0]
self.async_training = False
self.batch_size = 64
self.adam_optimizer_beta1 = 0.95
class DFPMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
self.max_size = (MemoryGranularity.Transitions, 20000)
self.shared_memory = True
super().__init__()
class DFPAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
self.goal_vector = [1.0, 1.0]
self.future_measurements_weights = [0.5, 0.5, 1.0]
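# goal_vector weighs the components of the measurements vector, while future_measurements_weights weigh the
# last temporal offsets when scoring actions (see choose_action)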
self.use_accumulated_reward_as_measurement = False
self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
self.scale_measurements_targets = {}
self.num_consecutive_playing_steps = EnvironmentSteps(8)
class DFPAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DFPAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=DFPMemoryParameters(),
networks={"main": DFPNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.dfp_agent:DFPAgent'
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_goal = self.ap.algorithm.goal_vector
self.target_measurements_scale_factors = None
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
network_inputs = batch.states(network_keys)
network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
self.ap.network_wrappers['main'].batch_size, axis=0)
# get the current outputs of the network
targets = self.networks['main'].online_network.predict(network_inputs)
# change the targets for the taken actions
for i in range(self.ap.network_wrappers['main'].batch_size):
targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()
result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if self.exploration_policy.requires_action_values():
# predict the future measurements
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
action_values = np.zeros(len(self.spaces.action.actions))
num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)
# calculate the score of each action by multiplying its future measurements with the goal vector
for action_idx in range(len(self.spaces.action.actions)):
action_measurements = measurements_future_prediction[action_idx]
action_measurements = np.reshape(action_measurements,
(self.ap.algorithm.num_predicted_steps_ahead,
self.spaces.state['measurements'].shape[0]))
future_steps_values = np.dot(action_measurements, self.current_goal)
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
self.ap.algorithm.future_measurements_weights)
else:
action_values = None
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(action_values)
if action_values is not None:
action_values = action_values.squeeze()
action_info = ActionInfo(action=action, action_value=action_values[action])
else:
action_info = ActionInfo(action=action)
return action_info
def set_environment_parameters(self, spaces: SpacesDefinition):
self.spaces = copy.deepcopy(spaces)
self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
measurements_names=
self.spaces.state['measurements'].measurements_names)
# if the user has filled in some scale values, check that the given names are valid
if set(self.spaces.state['measurements'].measurements_names).intersection(
self.ap.algorithm.scale_measurements_targets.keys()) !=\
set(self.ap.algorithm.scale_measurements_targets.keys()):
raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
"the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
self.spaces.state['measurements'].measurements_names))
super().set_environment_parameters(self.spaces)
# the below is done after calling the base class method, as it might add accumulated reward as a measurement
# fill out the missing measurements scale factors
for measurement_name in self.spaces.state['measurements'].measurements_names:
if measurement_name not in self.ap.algorithm.scale_measurements_targets:
self.ap.algorithm.scale_measurements_targets[measurement_name] = 1
self.target_measurements_scale_factors = \
np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
self.spaces.state['measurements'].measurements_names])
def handle_episode_ended(self):
last_episode = self.current_episode_buffer
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
self._update_measurements_targets(last_episode,
self.ap.algorithm.num_predicted_steps_ahead)
super().handle_episode_ended()
def _update_measurements_targets(self, episode, num_steps):
if 'measurements' not in episode.transitions[0].state or episode.transitions[0].state['measurements'] == []:
raise ValueError("Measurements are not present in the transitions of the last episode played. ")
measurements_size = self.spaces.state['measurements'].shape[0]
for transition_idx, transition in enumerate(episode.transitions):
transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
for step in range(num_steps):
offset_idx = transition_idx + 2 ** step
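# future measurements are predicted at exponentially spaced temporal offsets (1, 2, 4, ... steps ahead)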
if offset_idx >= episode.length():
if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
# the special MSE loss will ignore those entries so that the gradient will be 0 for these
transition.info['future_measurements'][step] = np.nan
continue
elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
offset_idx = - 1
transition.info['future_measurements'][step] = \
self.target_measurements_scale_factors * \
(episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])

View File

@@ -0,0 +1,99 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import EnvironmentSteps
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class DQNAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.discount = 0.99
class DQNNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = True
self.create_target_network = True
class DQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DQNAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=ExperienceReplayParameters(),
networks={"main": DQNNetworkParameters()})
self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
self.exploration.evaluation_epsilon = 0.05
@property
def path(self):
return 'rl_coach.agents.dqn_agent:DQNAgent'
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is:
# TD error = r + discount*max(q_st_plus_1) - q_st
# for all other actions, the error is 0
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] +\
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,108 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import copy
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
from rl_coach.core_types import RunPhase
from rl_coach.spaces import SpacesDefinition
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
def __init__(self):
super().__init__()
self.time_limit = 40
self.sub_goal_testing_rate = 0.5
class HACDDPGAgentParameters(DDPGAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = HACDDPGAlgorithmParameters()
@property
def path(self):
return 'rl_coach.agents.hac_ddpg_agent:HACDDPGAgent'
# Hierarchical Actor Critic Generating Subgoals DDPG Agent - https://arxiv.org/pdf/1712.00948.pdf
class HACDDPGAgent(DDPGAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
self.graph_manager = None
def choose_action(self, curr_state):
# the top level decides, for each of its generated sub-goals, whether all the layers beneath it are in a
# sub-goal testing phase
graph_manager = self.parent_level_manager.parent_graph_manager
if self.ap.is_a_highest_level_agent:
graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate
if self.phase == RunPhase.TRAIN:
if graph_manager.should_test_current_sub_goal:
self.exploration_policy.change_phase(RunPhase.TEST)
else:
self.exploration_policy.change_phase(self.phase)
action_info = super().choose_action(curr_state)
return action_info
def update_transition_before_adding_to_replay_buffer(self, transition):
graph_manager = self.parent_level_manager.parent_graph_manager
# deal with goals given from a higher level agent
if not self.ap.is_a_highest_level_agent:
transition.state['desired_goal'] = self.current_hrl_goal
transition.next_state['desired_goal'] = self.current_hrl_goal
# TODO: allow setting goals which are not part of the state. e.g. state-embedding using get_prediction
self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
self.current_hrl_goal, transition.next_state))
goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
self.current_hrl_goal, transition.next_state)
transition.reward = goal_reward
transition.game_over = transition.game_over or sub_goal_reached
# each level tests its own generated sub goals
if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
#TODO-fixme
# _, sub_goal_reached = self.parent_level_manager.environment.agents['agent_1'].spaces.goal.\
# get_reward_for_goal_and_state(transition.action, transition.next_state)
_, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
transition.action, transition.next_state)
sub_goal_is_missed = not sub_goal_reached
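# sub-goal testing: a missed sub-goal is penalized with the negative time limit, discouraging the higher
# level from proposing sub-goals the lower level cannot reach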
if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition
def set_environment_parameters(self, spaces: SpacesDefinition):
super().set_environment_parameters(spaces)
if self.ap.is_a_highest_level_agent:
# the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
# their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
self.spaces.goal = self.spaces.action
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
if not self.ap.is_a_highest_level_agent:
self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward

View File

@@ -0,0 +1,115 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from collections import OrderedDict
from typing import Union
import pygame
from rl_coach.agents.agent import Agent
from rl_coach.agents.bc_agent import BCNetworkParameters
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, InputEmbedderParameters, EmbedderScheme, \
AgentParameters
from rl_coach.core_types import ActionInfo
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from pandas import to_pickle
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class HumanAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
class HumanNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class HumanAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=HumanAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.human_agent:HumanAgent'
class HumanAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.clock = pygame.time.Clock()
self.max_fps = int(self.ap.visualization.max_fps_for_human_control)
self.env = None
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
self.env = self.parent_level_manager._real_environment
screen.log_title("Human Control Mode")
available_keys = self.env.get_available_keys()
if available_keys:
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
screen.log("")
for action, key in self.env.get_available_keys():
screen.log("\t- {}: {}".format(action, key))
screen.separator()
def train(self):
return 0
def choose_action(self, curr_state):
action = ActionInfo(self.env.get_action_from_user(), action_value=0)
action = self.output_filter.reverse_filter(action)
# keep constant fps
self.clock.tick(self.max_fps)
if not self.env.renderer.is_open:
self.save_replay_buffer_and_exit()
return action
def save_replay_buffer_and_exit(self):
replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
self.memory.tp = None
to_pickle(self.memory, replay_buffer_path)
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
exit()
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix="Recording")

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from typing import Union
from rl_coach.core_types import RunPhase, ActionInfo
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
## This is an abstract agent - there is no learn_from_batch method ##
# Imitation Agent
class ImitationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.imitation = True
def extract_action_values(self, prediction):
return prediction.squeeze()
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
prediction = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(curr_state, 'main'))
# get action values and extract the best action from it
action_values = self.extract_action_values(prediction)
if type(self.spaces.action) == DiscreteActionSpace:
# DISCRETE
self.exploration_policy.phase = RunPhase.TEST
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action,
action_probability=action_values[action])
else:
# CONTINUOUS
action = action_values
action_info = ActionInfo(action=action)
return action_info
def log_to_screen(self):
# log to screen
if self.phase == RunPhase.TRAIN:
# for the training phase - we log during the episode to visualize the progress in training
log = OrderedDict()
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Loss"] = self.loss.values[-1]
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix="Training")
else:
# for the evaluation phase - logging as in regular RL
super().log_to_screen()
def learn_from_batch(self, batch):
raise NotImplementedError("ImitationAgent is an abstract agent. Not to be used directly.")

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1
class MixedMonteCarloAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = MixedMonteCarloAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.mmc_agent:MixedMonteCarloAgent'
class MixedMonteCarloAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
        # for the 1-step target we use the double-DQN target, hence actions are selected greedily by the online network
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
# TD_targets are initialized with the current prediction so that we will
# only update the action that we have actually done in this transition
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
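        # the final target mixes the 1-step double-DQN target with the full Monte Carlo return of the episode:
        # target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_target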
for i in range(self.ap.network_wrappers['main'].batch_size):
one_step_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1[i][selected_actions[i]]
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.mixing_rate) * one_step_target + \
self.mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.utils import last_sample
from rl_coach.core_types import EnvironmentSteps
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class NStepQNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.async_training = True
self.shared_optimizer = True
self.create_target_network = True
class NStepQAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.apply_gradients_every_x_episodes = 1
self.num_steps_between_gradient_updates = 5 # this is called t_max in all the papers
self.targets_horizon = 'N-Step'
class NStepQAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NStepQAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": NStepQNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.n_step_q_agent:NStepQAgent'
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.q_values = self.register_signal('Q Values')
self.value_loss = self.register_signal('Value Loss')
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
state_value_head_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# the targets for the state value estimator
if self.ap.algorithm.targets_horizon == '1-Step':
# 1-Step Q learning
q_st_plus_1 = self.networks['main'].target_network.predict(batch.next_states(network_keys))
for i in reversed(range(batch.size)):
state_value_head_targets[i][batch.actions()[i]] = \
batch.rewards()[i] \
+ (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
elif self.ap.algorithm.targets_horizon == 'N-Step':
# N-Step Q learning
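            # bootstrap R from the target network's max Q value at the last next state (or 0 if the episode ended),
            # then accumulate the n-step return backwards over the batch: R_i = r_i + discount * R_{i+1}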
if batch.game_overs()[-1]:
R = 0
else:
R = np.max(self.networks['main'].target_network.predict(last_sample(batch.next_states(network_keys))))
for i in reversed(range(batch.size)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i][batch.actions()[i]] = R
else:
            raise ValueError('The available values for targets_horizon are: 1-Step, N-Step')
# train
result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys), [state_value_head_targets])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.value_loss.add_sample(losses[0])
return total_loss, losses, unclipped_grads
def train(self):
# update the target network of every network that has a target network
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
return PolicyOptimizationAgent.train(self)

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.naf_head import NAFHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, \
NetworkParameters, InputEmbedderParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps
from rl_coach.exploration_policies.ou_process import OUProcessParameters
class NAFNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [NAFHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.learning_rate = 0.001
self.async_training = True
self.create_target_network = True
class NAFAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_consecutive_training_steps = 5
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
class NAFAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NAFAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": NAFNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.naf_agent:NAFAgent'
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.l_values = self.register_signal("L")
self.a_values = self.register_signal("Advantage")
self.mu_values = self.register_signal("Action")
self.v_values = self.register_signal("V")
self.TD_targets = self.register_signal("TD targets")
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# TD error = r + discount*v_st_plus_1 - q_st
v_st_plus_1 = self.networks['main'].target_network.predict(
batch.next_states(network_keys),
self.networks['main'].target_network.output_heads[0].V,
squeeze_output=False,
)
TD_targets = np.expand_dims(batch.rewards(), -1) + \
(1.0 - np.expand_dims(batch.game_overs(), -1)) * self.ap.algorithm.discount * v_st_plus_1
self.TD_targets.add_sample(TD_targets)
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions(len(batch.actions().shape) == 1)
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if type(self.spaces.action) != BoxActionSpace:
raise ValueError('NAF works only for continuous control problems')
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
naf_head = self.networks['main'].online_network.output_heads[0]
action_values = self.networks['main'].online_network.predict(tf_input_state, outputs=naf_head.mu,
squeeze_output=False)
# get the actual action to use
action = self.exploration_policy.get_action(action_values)
# get the internal values for logging
outputs = [naf_head.mu, naf_head.Q, naf_head.L, naf_head.A, naf_head.V]
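        # NAF decomposes Q(s, a) = V(s) + A(s, a), with A(s, a) = -0.5 * (a - mu(s))^T * P(s) * (a - mu(s)),
        # so mu(s) is the greedy action and the values below are fetched only for logging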
result = self.networks['main'].online_network.predict(
{**tf_input_state, 'output_0_0': action_values},
outputs=outputs
)
mu, Q, L, A, V = result
# store the q values statistics for logging
self.q_values.add_sample(Q)
self.l_values.add_sample(L)
self.a_values.add_sample(A)
self.mu_values.add_sample(mu)
self.v_values.add_sample(V)
action_info = ActionInfo(action=action, action_value=Q)
return action_info

View File

@@ -0,0 +1,176 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import pickle
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.dnd_q_head import DNDQHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters
from rl_coach.core_types import RunPhase, EnvironmentSteps, Episode, StateType
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class NECNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [DNDQHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
class NECAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.dnd_size = 500000
self.l2_norm_added_delta = 0.001
self.new_value_shift_coefficient = 0.1
self.number_of_knn = 50
self.DND_key_error_threshold = 0
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.propagate_updates_to_DND = False
self.n_step = 100
self.bootstrap_total_return_from_old_policy = True
class NECMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
super().__init__()
self.max_size = (MemoryGranularity.Transitions, 100000)
class NECAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NECAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=NECMemoryParameters(),
networks={"main": NECNetworkParameters()})
self.exploration.epsilon_schedule = ConstantSchedule(0.1)
self.exploration.evaluation_epsilon = 0.01
@property
def path(self):
return 'rl_coach.agents.nec_agent:NECAgent'
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_episode_state_embeddings = []
self.training_started = False
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def learn_from_batch(self, batch):
if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(self.ap.algorithm.number_of_knn):
return 0, [], 0
else:
if not self.training_started:
self.training_started = True
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# only update the action that we have actually done in this transition
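            # NEC regresses Q(s, a) directly towards the N-step bootstrapped return stored with each transition
            # (see the Episode(n_step=...) buffer in __init__), rather than towards a 1-step TD target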
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]
# set the gradients to fetch for the DND update
fetches = []
head = self.networks['main'].online_network.output_heads[0]
if self.ap.algorithm.propagate_updates_to_DND:
fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]
# train the neural network
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets, fetches)
total_loss, losses, unclipped_grads = result[:3]
# update the DND keys and values using the extracted gradients
if self.ap.algorithm.propagate_updates_to_DND:
embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
value_gradients = np.swapaxes(result[-1][1], 0, 1)
indices = np.swapaxes(result[-1][2], 0, 1)
head.DND.update_keys_and_values(batch.actions(), embedding_gradients, value_gradients, indices)
return total_loss, losses, unclipped_grads
def act(self):
if self.phase == RunPhase.HEATUP:
# get embedding in heatup (otherwise we get it through get_prediction)
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(self.curr_state, 'main'),
outputs=self.networks['main'].online_network.state_embedding)
self.current_episode_state_embeddings.append(embedding)
return super().act()
def get_all_q_values_for_states(self, states: StateType):
        # we need to store the state embeddings regardless of whether the action is random or not
return self.get_prediction(states)
def get_prediction(self, states):
# get the actions q values and the state embedding
embedding, actions_q_values = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(states, 'main'),
outputs=[self.networks['main'].online_network.state_embedding,
self.networks['main'].online_network.output_heads[0].output]
)
if self.phase != RunPhase.TEST:
# store the state embedding for inserting it to the DND later
self.current_episode_state_embeddings.append(embedding.squeeze())
actions_q_values = actions_q_values[0][0]
return actions_q_values
def reset_internal_state(self):
super().reset_internal_state()
self.current_episode_state_embeddings = []
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def handle_episode_ended(self):
super().handle_episode_ended()
# get the last full episode that we have collected
episode = self.call_memory('get_last_complete_episode')
if episode is not None and self.phase != RunPhase.TEST:
assert len(self.current_episode_state_embeddings) == episode.length()
returns = episode.get_transitions_attribute('total_return')
actions = episode.get_transitions_attribute('action')
self.networks['main'].online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
actions, returns)
def save_checkpoint(self, checkpoint_id):
with open(os.path.join(self.ap.task_parameters.save_checkpoint_dir, str(checkpoint_id) + '.dnd'), 'wb') as f:
pickle.dump(self.networks['main'].online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,94 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay, \
EpisodicExperienceReplayParameters
class PALAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.pal_alpha = 0.9
self.persistent_advantage_learning = False
self.monte_carlo_mixing_rate = 0.1
class PALAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = PALAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.pal_agent:PALAgent'
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.alpha = agent_parameters.algorithm.pal_alpha
self.persistent = agent_parameters.algorithm.persistent_advantage_learning
self.monte_carlo_mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# next state values
q_st_plus_1_target, q_st_plus_1_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.next_states(network_keys))
])
selected_actions = np.argmax(q_st_plus_1_online, 1)
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
# current state values
q_st_target, q_st_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
v_st_target = np.max(q_st_target, 1)
# calculate TD error
TD_targets = np.copy(q_st_online)
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1_target[i][selected_actions[i]]
advantage_learning_update = v_st_target[i] - q_st_target[i, batch.actions()[i]]
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
# Persistent Advantage Learning or Regular Advantage Learning
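            # advantage learning subtracts alpha * (V(s) - Q(s, a)) from the DQN target to increase the action gap;
            # persistent advantage learning subtracts alpha * min(V(s) - Q(s, a), V(s') - Q(s', a')) instead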
if self.persistent:
TD_targets[i, batch.actions()[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
else:
TD_targets[i, batch.actions()[i]] -= self.alpha * advantage_learning_update
# mixing monte carlo updates
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, batch.actions()[i]] \
+ self.monte_carlo_mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,105 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class PolicyGradientNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers
class PolicyGradientsAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": PolicyGradientNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.policy_gradients_agent:PolicyGradientsAgent'
class PolicyGradientsAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.returns_mean = self.register_signal('Returns Mean')
self.returns_variance = self.register_signal('Returns Variance')
self.last_gradient_update_step_idx = 0
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
total_returns = batch.total_returns()
for i in reversed(range(batch.size)):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
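                # the mean return observed at this timestep across previous episodes acts as a time-dependent
                # baseline, which reduces the variance of the gradient estimate without biasing it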
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
targets = total_returns
actions = batch.actions()
if type(self.spaces.action) != DiscreteActionSpace and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
self.returns_mean.add_sample(np.mean(total_returns))
self.returns_variance.add_sample(np.std(total_returns))
result = self.networks['main'].online_network.accumulate_gradients(
{**batch.states(network_keys), 'output_0_0': actions}, targets
)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,166 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.core_types import Batch, ActionInfo
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import eps
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
class PolicyGradientRescaler(Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined
Q_VALUE = 4
A_VALUE = 5
TD_RESIDUAL = 6
DISCOUNTED_TD_RESIDUAL = 7
GAE = 8
## This is an abstract agent - there is no learn_from_batch method ##
class PolicyOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.policy_gradient_rescaler = None
if hasattr(self.ap.algorithm, 'policy_gradient_rescaler'):
self.policy_gradient_rescaler = self.ap.algorithm.policy_gradient_rescaler
# statistics for variance reduction
self.last_gradient_update_step_idx = 0
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = self.register_signal('Entropy')
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_episode_statistics(self, episode):
episode_discounted_returns = []
for i in range(episode.length()):
transition = episode.get_transition(i)
episode_discounted_returns.append(transition.total_return)
self.num_episodes_where_step_has_been_seen[i] += 1
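            # incremental mean update per timestep: mean <- mean + (total_return - mean) / n,
            # implemented below as subtracting mean / n and then adding total_return / n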
self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_return_over_multiple_episodes[i] += transition.total_return / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_discounted_return = np.mean(episode_discounted_returns)
self.std_discounted_return = np.std(episode_discounted_returns)
def get_current_episode(self):
# we get the episode most of the time from the current episode buffer and only in the last transition from the
# "memory" (where is was stored in the end of the episode)
return self.memory.get_episode(0) or self.current_episode_buffer
def train(self):
episode = self.get_current_episode()
# check if we should calculate gradients or skip
episode_ended = episode.is_complete
num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
if not (is_t_max_steps_passed or episode_ended):
return 0
total_loss = 0
if num_steps_passed_since_last_update > 0:
# we need to update the returns of the episode until now
episode.update_returns()
            # get t_max transitions, or fewer if we reached a terminal state;
            # will be used for both actor-critic and vanilla PG.
            # In order to get full episodes, vanilla PG will set the end_idx to a very big value.
transitions = []
start_idx = self.last_gradient_update_step_idx
end_idx = episode.length()
for idx in range(start_idx, end_idx):
transitions.append(episode.get_transition(idx))
self.last_gradient_update_step_idx = end_idx
# update the statistics for the variance reduction techniques
if self.policy_gradient_rescaler in \
[PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
self.update_episode_statistics(episode)
# accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
batch = Batch(transitions)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
for network in self.networks.values():
network.apply_gradients_and_sync_networks()
self.training_iteration += 1
# move the pointer to the next episode start and discard the episode.
if episode_ended:
# we need to remove the episode, because the next training iteration will be called before storing any
# additional transitions in the memory (we don't store a transition for the first call to observe), so the
# length of the memory won't be enforced and the old episode won't be removed
self.call_memory('remove_episode', 0)
self.last_gradient_update_step_idx = 0
return total_loss
def learn_from_batch(self, batch):
raise NotImplementedError("PolicyOptimizationAgent is an abstract agent. Not to be used directly.")
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
action_values = self.get_prediction(curr_state)
if isinstance(self.spaces.action, DiscreteActionSpace):
# DISCRETE
action_probabilities = np.array(action_values).squeeze()
action = self.exploration_policy.get_action(action_probabilities)
action_info = ActionInfo(action=action,
action_probability=action_probabilities[action])
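            # policy entropy H(pi(.|s)) = -sum_a pi(a|s) * log(pi(a|s)); eps avoids log(0)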
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
elif isinstance(self.spaces.action, BoxActionSpace):
# CONTINUOUS
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action)
else:
raise ValueError("The action space of the environment is not compatible with the algorithm")
return action_info

View File

@@ -0,0 +1,338 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters, DistributedTaskParameters
from rl_coach.core_types import EnvironmentSteps, Batch
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import force_list
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class PPOCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [PPOHeadParameters()]
self.optimizer_type = 'Adam'
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.96
self.target_kl_divergence = 0.01
self.initial_kl_coefficient = 1.0
self.high_kl_penalty_coefficient = 1000
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
class PPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ppo_agent:PPOAgent'
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.kl_divergence = self.register_signal('KL Divergence')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
def fill_advantages(self, batch):
batch = Batch(batch)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# current_states_with_timestep = self.concat_state_and_timestep(batch)
current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()
# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
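            # GAE: each advantage is an exponentially-weighted sum of TD residuals,
            # A_t = sum_l (gamma * lambda)^l * delta_{t+l}, computed separately for every completed rollout in the batch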
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
# current_state_values[batch.game_overs()] = 0
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, _ = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
# TODO: this will be problematic with a shared memory
for transition, advantage in zip(self.memory.transitions, advantages):
transition.info['advantage'] = advantage
self.action_advantages.add_sample(advantages)
def train_value_network(self, dataset, epochs):
loss = []
batch = Batch(dataset)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# add a timestep to the observation
# current_states_with_timestep = self.concat_state_and_timestep(dataset)
mix_fraction = self.ap.algorithm.value_targets_mix_fraction
for j in range(epochs):
curr_batch_size = batch.size
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
curr_batch_size = self.ap.network_wrappers['critic'].batch_size
for i in range(batch.size // curr_batch_size):
# split to batches for first order optimization techniques
current_states_batch = {
k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
for k, v in batch.states(network_keys).items()
}
total_return_batch = batch.total_returns(True)[i * curr_batch_size:(i + 1) * curr_batch_size]
old_policy_values = force_list(self.networks['critic'].target_network.predict(
current_states_batch).squeeze())
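                # regression targets for the value head: the empirical returns for first-order optimizers, or a mix of
                # the current value predictions with the returns (value_targets_mix_fraction) when using LBFGS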
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
else:
current_values = self.networks['critic'].online_network.predict(current_states_batch)
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
inputs = copy.copy(current_states_batch)
for input_index, input in enumerate(old_policy_values):
name = 'output_0_{}'.format(input_index)
if name in self.networks['critic'].online_network.inputs:
inputs[name] = input
value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)
self.networks['critic'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['critic'].apply_gradients_to_global_network()
self.networks['critic'].online_network.reset_accumulated_gradients()
loss.append([value_loss[0]])
loss = np.mean(loss, 0)
return loss
def concat_state_and_timestep(self, dataset):
current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
for transition in dataset]
current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
return current_states_with_timestep
def train_policy_network(self, dataset, epochs):
loss = []
for j in range(epochs):
loss = {
'total_loss': [],
'policy_losses': [],
'unclipped_grads': [],
'fetch_result': []
}
#shuffle(dataset)
for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
(i + 1) * self.ap.network_wrappers['actor'].batch_size])
network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
advantages = batch.info('advantage')
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
self.networks['actor'].online_network.output_heads[0].entropy]
inputs = copy.copy(batch.states(network_keys))
inputs['output_0_0'] = actions
                # old_policy_distribution needs to be represented as a list because, in the case of discrete controls,
                # it has just a mean; otherwise, it has both a mean and a standard deviation
for input_index, input in enumerate(old_policy):
inputs['output_0_{}'.format(input_index + 1)] = input
total_loss, policy_losses, unclipped_grads, fetch_result =\
self.networks['actor'].online_network.accumulate_gradients(
inputs, [advantages], additional_fetches=fetches)
self.networks['actor'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['actor'].apply_gradients_to_global_network()
self.networks['actor'].online_network.reset_accumulated_gradients()
loss['total_loss'].append(total_loss)
loss['policy_losses'].append(policy_losses)
loss['unclipped_grads'].append(unclipped_grads)
loss['fetch_result'].append(fetch_result)
self.unclipped_grads.add_sample(unclipped_grads)
for key in loss.keys():
loss[key] = np.mean(loss[key], 0)
if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
self.entropy.add_sample(loss['fetch_result'][1])
self.kl_divergence.add_sample(loss['fetch_result'][0])
return loss['total_loss']
def update_kl_coefficient(self):
        # John Schulman takes the mean KL divergence only over the last epoch, which is strange, but we will follow
        # his implementation for now because we know it works well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
# update kl coefficient
kl_target = self.ap.algorithm.target_kl_divergence
kl_coefficient = self.networks['actor'].online_network.get_variable_value(
self.networks['actor'].online_network.output_heads[0].kl_coefficient)
new_kl_coefficient = kl_coefficient
if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
# kl too high => increase regularization
new_kl_coefficient *= 1.5
elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
# kl too low => decrease regularization
new_kl_coefficient /= 1.5
# update the kl coefficient variable
if kl_coefficient != new_kl_coefficient:
self.networks['actor'].online_network.set_variable_value(
self.networks['actor'].online_network.output_heads[0].assign_kl_coefficient,
new_kl_coefficient,
self.networks['actor'].online_network.output_heads[0].kl_coefficient_ph)
screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
def post_training_commands(self):
if self.ap.algorithm.use_kl_regularization:
self.update_kl_coefficient()
# clean memory
self.call_memory('clean')
def train(self):
loss = 0
if self._should_train(wait_for_full_episode=True):
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['actor'].sync()
self.networks['critic'].sync()
dataset = self.memory.transitions
self.fill_advantages(dataset)
# take only the requested number of steps
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
value_loss = self.train_value_network(dataset, 1)
policy_loss = self.train_policy_network(dataset, 10)
self.value_loss.add_sample(value_loss)
self.policy_loss.add_sample(policy_loss)
self.post_training_commands()
self.training_iteration += 1
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "actor")
return self.networks['actor'].online_network.predict(tf_input_state)

View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.architectures.tensorflow_components.heads.quantile_regression_q_head import QuantileRegressionQHeadParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import StateType
class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [QuantileRegressionQHeadParameters()]
self.learning_rate = 0.00005
self.optimizer_epsilon = 0.01 / 32
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.atoms = 200
self.huber_loss_interval = 1 # called k in the paper
class QuantileRegressionDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = QuantileRegressionDQNAlgorithmParameters()
self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.qr_dqn_agent:QuantileRegressionDQNAgent'
# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.quantile_probabilities = np.ones(self.ap.algorithm.atoms) / float(self.ap.algorithm.atoms)
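        # each of the `atoms` quantile values has uniform probability 1/atoms, so Q(s, a) is simply the mean of the
        # predicted quantiles (computed as a dot product in get_q_values)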
def get_q_values(self, quantile_values):
return np.dot(quantile_values, self.quantile_probabilities)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
quantile_values = self.get_prediction(states)
actions_q_values = self.get_q_values(quantile_values)
else:
actions_q_values = None
return actions_q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the quantiles of the next states and current states
next_state_quantiles, current_quantiles = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# get the optimal actions to take for the next states
target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)
# calculate the Bellman update
batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))
TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
* next_state_quantiles[batch_idx, target_actions]
# get the locations of the selected actions within the batch for indexing purposes
actions_locations = [[b, a] for b, a in zip(batch_idx, batch.actions())]
# calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms) # tau_i
quantile_midpoints = 0.5*(cumulative_probabilities[1:] + cumulative_probabilities[:-1]) # tau^hat_i
quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
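        # the midpoints are permuted to follow the sort order of the predicted quantiles, so that in the quantile
        # regression loss each predicted quantile is paired with the tau^hat matching its rank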
for idx in range(self.ap.network_wrappers['main'].batch_size):
quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]
# train
result = self.networks['main'].train_and_sync_networks({
**batch.states(network_keys),
'output_0_0': actions_locations,
'output_0_1': quantile_midpoints,
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.core_types import ActionInfo, StateType
## This is an abstract agent - there is no learn_from_batch method ##
class ValueOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.q_value_for_action = {}
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
if isinstance(self.spaces.action, DiscreteActionSpace):
for i in range(len(self.spaces.action.actions)):
self.q_value_for_action[i] = self.register_signal("Q for action {}".format(i),
dump_one_value_per_episode=False,
dump_one_value_per_step=True)
# Algorithms for which q_values are calculated from predictions will override this function
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
actions_q_values = self.get_prediction(states)
else:
actions_q_values = None
return actions_q_values
def get_prediction(self, states):
return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))
def update_transition_priorities_and_get_weights(self, TD_errors, batch):
# update errors in prioritized replay buffer
importance_weights = None
if isinstance(self.memory, PrioritizedExperienceReplay):
self.call_memory('update_priorities', (batch.info('idx'), TD_errors))
importance_weights = batch.info('weight')
return importance_weights
def _validate_action(self, policy, action):
if np.array(action).shape != ():
raise ValueError((
'The exploration_policy {} returned a vector of actions '
'instead of a single action. ValueOptimizationAgents '
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))
def choose_action(self, curr_state):
actions_q_values = self.get_all_q_values_for_states(curr_state)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(actions_q_values)
self._validate_action(self.exploration_policy, action)
if actions_q_values is not None:
# this is for bootstrapped dqn
if type(actions_q_values) == list and len(actions_q_values) > 0:
actions_q_values = self.exploration_policy.last_action_values
actions_q_values = actions_q_values.squeeze()
# store the q values statistics for logging
self.q_values.add_sample(actions_q_values)
for i, q_value in enumerate(actions_q_values):
self.q_value_for_action[i].add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=actions_q_values[action],
max_action_value=np.max(actions_q_values))
else:
action_info = ActionInfo(action=action)
return action_info
def learn_from_batch(self, batch):
raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")