pre-release 0.10.0
rl_coach/agents/__init__.py (new file, 15 lines)
@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
rl_coach/agents/actor_critic_agent.py (new file, 165 lines)
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import scipy.signal
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
|
||||
AgentParameters, InputEmbedderParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
from rl_coach.utils import last_sample
|
||||
|
||||
from rl_coach.logger import screen
|
||||
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
|
||||
|
||||
|
||||
class ActorCriticAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.A_VALUE
|
||||
self.apply_gradients_every_x_episodes = 5
|
||||
self.beta_entropy = 0
|
||||
self.num_steps_between_gradient_updates = 5000 # this is called t_max in all the papers
|
||||
self.gae_lambda = 0.96
|
||||
self.estimate_state_value_using_gae = False
|
||||
|
||||
|
||||
class ActorCriticNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [VHeadParameters(), PolicyHeadParameters()]
|
||||
self.loss_weights = [0.5, 1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1, 1]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.clip_gradients = 40.0
|
||||
self.async_training = True
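# Illustrative sketch: the parameters above imply a two-headed loss, a state-value (V)
# regression term weighted 0.5 plus a policy-gradient term weighted 1.0, with gradients
# clipped to a norm of 40. A minimal NumPy version of that combined objective (an
# assumption-laden illustration, not the actual TensorFlow heads used by Coach):

import numpy as np

def combined_actor_critic_loss(v_pred, v_target, log_pi_taken, advantages,
                               value_weight=0.5, policy_weight=1.0,
                               beta_entropy=0.0, entropy=0.0):
    # squared-error loss for the V head, policy-gradient loss for the policy head,
    # optionally regularized by an entropy bonus (beta_entropy defaults to 0, as above)
    value_loss = np.mean((v_pred - v_target) ** 2)
    policy_loss = -np.mean(log_pi_taken * advantages)
    return value_weight * value_loss + policy_weight * policy_loss - beta_entropy * entropy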
|
||||
|
||||
|
||||
class ActorCriticAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=ActorCriticAlgorithmParameters(),
|
||||
exploration=None, #TODO this should be different for continuous (ContinuousEntropyExploration)
|
||||
# and discrete (CategoricalExploration) action spaces.
|
||||
memory=SingleEpisodeBufferParameters(),
|
||||
networks={"main": ActorCriticNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.actor_critic_agent:ActorCriticAgent'
|
||||
|
||||
|
||||
# Actor Critic - https://arxiv.org/abs/1602.01783
|
||||
class ActorCriticAgent(PolicyOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.last_gradient_update_step_idx = 0
|
||||
self.action_advantages = self.register_signal('Advantages')
|
||||
self.state_values = self.register_signal('Values')
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
self.policy_loss = self.register_signal('Policy Loss')
|
||||
|
||||
# Discounting function used to calculate discounted returns.
|
||||
def discount(self, x, gamma):
|
||||
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
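# Illustrative, runnable sketch: the lfilter call above implements the backward recursion
# R_t = x_t + gamma * R_{t+1}, i.e. a discounted cumulative sum computed from the end of
# the trajectory. A plain-loop equivalent (standalone, shown for illustration only):

import numpy as np
import scipy.signal

def discount_with_loop(x, gamma):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

rewards = np.array([1.0, 0.0, 0.0, 1.0])
filtered = scipy.signal.lfilter([1], [1, -0.99], rewards[::-1], axis=0)[::-1]
assert np.allclose(filtered, discount_with_loop(rewards, 0.99))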
|
||||
|
||||
def get_general_advantage_estimation_values(self, rewards, values):
|
||||
# values contains n+1 elements (V_t ... V_{t+n}, the last one being the bootstrap), rewards contains n elements (r_t ... r_{t+n-1})
|
||||
bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])
|
||||
|
||||
# Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
# although in practice it works well even for much smaller Tmax values, e.g. 20)
|
||||
deltas = rewards + self.ap.algorithm.discount * values[1:] - values[:-1]
|
||||
gae = self.discount(deltas, self.ap.algorithm.discount * self.ap.algorithm.gae_lambda)
|
||||
|
||||
if self.ap.algorithm.estimate_state_value_using_gae:
|
||||
discounted_returns = np.expand_dims(gae + values[:-1], -1)
|
||||
else:
|
||||
discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
|
||||
self.ap.algorithm.discount)), 1)[:-1]
|
||||
return gae, discounted_returns
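# Illustrative note on the quantities computed above (notation only, not executable Coach code):
# the TD residuals and the GAE advantage are
#
#     delta_t   = r_t + discount * V(s_{t+1}) - V(s_t)
#     A_t^{GAE} = sum_{k >= 0} (discount * gae_lambda)^k * delta_{t+k}
#
# which is exactly discount(deltas, discount * gae_lambda). When estimate_state_value_using_gae
# is enabled, the V-head target becomes A_t^{GAE} + V(s_t); otherwise it is the discounted
# return of the bootstrap-extended reward sequence.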
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
# batch contains a list of episodes to learn from
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# get the values for the current states
|
||||
|
||||
result = self.networks['main'].online_network.predict(batch.states(network_keys))
|
||||
current_state_values = result[0]
|
||||
|
||||
self.state_values.add_sample(current_state_values)
|
||||
|
||||
# the targets for the state value estimator
|
||||
num_transitions = batch.size
|
||||
state_value_head_targets = np.zeros((num_transitions, 1))
|
||||
|
||||
# estimate the advantage function
|
||||
action_advantages = np.zeros((num_transitions, 1))
|
||||
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
|
||||
if batch.game_overs()[-1]:
|
||||
R = 0
|
||||
else:
|
||||
R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
|
||||
|
||||
for i in reversed(range(num_transitions)):
|
||||
R = batch.rewards()[i] + self.ap.algorithm.discount * R
|
||||
state_value_head_targets[i] = R
|
||||
action_advantages[i] = R - current_state_values[i]
|
||||
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
|
||||
# get bootstraps
|
||||
bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
|
||||
values = np.append(current_state_values, bootstrapped_value)
|
||||
if batch.game_overs()[-1]:
|
||||
values[-1] = 0
|
||||
|
||||
# get general discounted returns table
|
||||
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
|
||||
action_advantages = np.vstack(gae_values)
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
action_advantages = action_advantages.squeeze(axis=-1)
|
||||
actions = batch.actions()
|
||||
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
# train
|
||||
result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
|
||||
'output_1_0': actions},
|
||||
[state_value_head_targets, action_advantages])
|
||||
|
||||
# logging
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
self.action_advantages.add_sample(action_advantages)
|
||||
self.unclipped_grads.add_sample(unclipped_grads)
|
||||
self.value_loss.add_sample(losses[0])
|
||||
self.policy_loss.add_sample(losses[1])
|
||||
|
||||
return total_loss, losses, unclipped_grads
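# Illustrative, runnable sketch (standalone, not Coach code) of the A_VALUE branch above:
# bootstrapped n-step returns computed backwards, with the advantage being R - V(s).

import numpy as np

rewards = [1.0, 0.0, 1.0]
values = np.array([0.5, 0.4, 0.6])      # V(s_t) predicted by the online network
discount, R = 0.99, 0.0                 # R = 0 because the last transition ended the episode
returns = np.zeros(len(rewards))
for i in reversed(range(len(rewards))):
    R = rewards[i] + discount * R
    returns[i] = R
advantages = returns - values
assert np.isclose(returns[-1], 1.0) and np.isclose(advantages[-1], 0.4)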
|
||||
|
||||
def get_prediction(self, states):
|
||||
tf_input_state = self.prepare_batch_for_inference(states, "main")
|
||||
return self.networks['main'].online_network.predict(tf_input_state)[1:] # index 0 is the state value
|
||||
rl_coach/agents/agent.py (new file, 791 lines)
@@ -0,0 +1,791 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
import random
|
||||
from collections import OrderedDict
|
||||
from typing import Dict, List, Union, Tuple
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.agents.agent_interface import AgentInterface
|
||||
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
|
||||
from rl_coach.core_types import RunPhase, PredictionType, EnvironmentEpisodes, ActionType, Batch, Episode, StateType
|
||||
from rl_coach.core_types import Transition, ActionInfo, TrainingSteps, EnvironmentSteps, EnvResponse
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
|
||||
from pandas import read_pickle
|
||||
from six.moves import range
|
||||
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace, GoalsSpace, AttentionActionSpace
|
||||
from rl_coach.utils import Signal, force_list, set_cpu
|
||||
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
|
||||
|
||||
from rl_coach.architectures.network_wrapper import NetworkWrapper
|
||||
from rl_coach.logger import screen, Logger, EpisodeLogger
|
||||
|
||||
|
||||
class Agent(AgentInterface):
|
||||
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
"""
|
||||
:param agent_parameters: An AgentParameters instance holding all the running parameters
|
||||
"""
|
||||
super().__init__()
|
||||
self.ap = agent_parameters
|
||||
self.task_id = self.ap.task_parameters.task_index
|
||||
self.is_chief = self.task_id == 0
|
||||
self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
|
||||
and self.ap.memory.shared_memory
|
||||
if self.shared_memory:
|
||||
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
|
||||
self.name = agent_parameters.name
|
||||
self.parent = parent
|
||||
self.parent_level_manager = None
|
||||
self.full_name_id = agent_parameters.full_name_id = self.name
|
||||
|
||||
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
|
||||
screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
|
||||
"tensorflow wake up time)".format(self.full_name_id, self.task_id))
|
||||
else:
|
||||
screen.log_title("Creating agent - name: {}".format(self.full_name_id))
|
||||
self.imitation = False
|
||||
self.agent_logger = Logger()
|
||||
self.agent_episode_logger = EpisodeLogger()
|
||||
|
||||
# get the memory
|
||||
# - distributed training + shared memory:
|
||||
# * is chief? -> create the memory and add it to the scratchpad
|
||||
# * not chief? -> wait for the chief to create the memory and then fetch it
|
||||
# - non distributed training / not shared memory:
|
||||
# * create memory
|
||||
memory_name = self.ap.memory.path.split(':')[1]
|
||||
self.memory_lookup_name = self.full_name_id + '.' + memory_name
|
||||
if self.shared_memory and not self.is_chief:
|
||||
self.memory = self.shared_memory_scratchpad.get(self.memory_lookup_name)
|
||||
else:
|
||||
# modules
|
||||
if agent_parameters.memory.load_memory_from_file_path:
|
||||
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
|
||||
.format(agent_parameters.memory.load_memory_from_file_path))
|
||||
self.memory = read_pickle(agent_parameters.memory.load_memory_from_file_path)
|
||||
else:
|
||||
self.memory = dynamic_import_and_instantiate_module_from_params(self.ap.memory)
|
||||
|
||||
if self.shared_memory and self.is_chief:
|
||||
self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)
|
||||
|
||||
# set devices
|
||||
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
|
||||
self.has_global = True
|
||||
self.replicated_device = agent_parameters.task_parameters.device
|
||||
self.worker_device = "/job:worker/task:{}".format(self.task_id)
|
||||
else:
|
||||
self.has_global = False
|
||||
self.replicated_device = None
|
||||
self.worker_device = ""
|
||||
if agent_parameters.task_parameters.use_cpu:
|
||||
self.worker_device += "/cpu:0"
|
||||
else:
|
||||
self.worker_device += "/device:GPU:0"
|
||||
|
||||
# filters
|
||||
self.input_filter = self.ap.input_filter
|
||||
self.output_filter = self.ap.output_filter
|
||||
self.pre_network_filter = self.ap.pre_network_filter
|
||||
device = self.replicated_device if self.replicated_device else self.worker_device
|
||||
self.input_filter.set_device(device)
|
||||
self.output_filter.set_device(device)
|
||||
self.pre_network_filter.set_device(device)
|
||||
|
||||
|
||||
# initialize all internal variables
|
||||
self._phase = RunPhase.HEATUP
|
||||
self.total_shaped_reward_in_current_episode = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
self.total_steps_counter = 0
|
||||
self.running_reward = None
|
||||
self.training_iteration = 0
|
||||
self.last_target_network_update_step = 0
|
||||
self.last_training_phase_step = 0
|
||||
self.current_episode = self.ap.current_episode = 0
|
||||
self.curr_state = {}
|
||||
self.current_hrl_goal = None
|
||||
self.current_episode_steps_counter = 0
|
||||
self.episode_running_info = {}
|
||||
self.last_episode_evaluation_ran = 0
|
||||
self.running_observations = []
|
||||
self.agent_logger.set_current_time(self.current_episode)
|
||||
self.exploration_policy = None
|
||||
self.networks = {}
|
||||
self.last_action_info = None
|
||||
self.running_observation_stats = None
|
||||
self.running_reward_stats = None
|
||||
self.accumulated_rewards_across_evaluation_episodes = 0
|
||||
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
|
||||
self.num_successes_across_evaluation_episodes = 0
|
||||
self.num_evaluation_episodes_completed = 0
|
||||
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
|
||||
# TODO: add the agent's observation rendering for debugging purposes (not the same as the environment rendering)
|
||||
|
||||
# environment parameters
|
||||
self.spaces = None
|
||||
self.in_action_space = self.ap.algorithm.in_action_space
|
||||
|
||||
# signals
|
||||
self.episode_signals = []
|
||||
self.step_signals = []
|
||||
self.loss = self.register_signal('Loss')
|
||||
self.curr_learning_rate = self.register_signal('Learning Rate')
|
||||
self.unclipped_grads = self.register_signal('Grads (unclipped)')
|
||||
self.reward = self.register_signal('Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
|
||||
self.shaped_reward = self.register_signal('Shaped Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
|
||||
if isinstance(self.in_action_space, GoalsSpace):
|
||||
self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)
|
||||
|
||||
# use seed
|
||||
if self.ap.task_parameters.seed is not None:
|
||||
random.seed(self.ap.task_parameters.seed)
|
||||
np.random.seed(self.ap.task_parameters.seed)
|
||||
|
||||
@property
|
||||
def parent(self):
|
||||
"""
|
||||
Get the parent class of the agent
:return: the agent's parent
|
||||
"""
|
||||
return self._parent
|
||||
|
||||
@parent.setter
|
||||
def parent(self, val):
|
||||
"""
|
||||
Change the parent class of the agent.
|
||||
Additionally, updates the full name of the agent
|
||||
:param val: the new parent
|
||||
:return: None
|
||||
"""
|
||||
self._parent = val
|
||||
if self._parent is not None:
|
||||
if not hasattr(self._parent, 'name'):
|
||||
raise ValueError("The parent of an agent must have a name")
|
||||
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
|
||||
|
||||
def setup_logger(self):
|
||||
# dump documentation
|
||||
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
|
||||
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
|
||||
level_name=self.parent_level_manager.name,
|
||||
agent_full_id='.'.join(self.full_name_id.split('/')))
|
||||
self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
|
||||
add_timestamp=True, task_id=self.task_id)
|
||||
if self.ap.visualization.dump_in_episode_signals:
|
||||
self.agent_episode_logger.set_logger_filenames(self.ap.task_parameters.experiment_path,
|
||||
logger_prefix=logger_prefix,
|
||||
add_timestamp=True, task_id=self.task_id)
|
||||
|
||||
def set_session(self, sess) -> None:
|
||||
"""
|
||||
Set the deep learning framework session for all the agents in the composite agent
|
||||
:return: None
|
||||
"""
|
||||
self.input_filter.set_session(sess)
|
||||
self.output_filter.set_session(sess)
|
||||
self.pre_network_filter.set_session(sess)
|
||||
[network.set_session(sess) for network in self.networks.values()]
|
||||
|
||||
def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True,
|
||||
dump_one_value_per_step: bool=False) -> Signal:
|
||||
"""
|
||||
Register a signal such that its statistics will be dumped and be viewable through dashboard
|
||||
:param signal_name: the name of the signal as it will appear in dashboard
|
||||
:param dump_one_value_per_episode: should the signal value be written for each episode?
|
||||
:param dump_one_value_per_step: should the signal value be written for each step?
|
||||
:return: the created signal
|
||||
"""
|
||||
signal = Signal(signal_name)
|
||||
if dump_one_value_per_episode:
|
||||
self.episode_signals.append(signal)
|
||||
if dump_one_value_per_step:
|
||||
self.step_signals.append(signal)
|
||||
return signal
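# Usage sketch: a subclass typically registers its own signals in __init__ and feeds them
# during training; the per-episode statistics then become visible in Dashboard.
# (Hypothetical signal name, shown for illustration only.)
#
#     self.kl_divergence = self.register_signal('KL Divergence', dump_one_value_per_step=True)
#     ...
#     self.kl_divergence.add_sample(kl_value)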
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
"""
|
||||
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
|
||||
dependent on those values, by calling init_environment_dependent_modules
|
||||
:param spaces: the environment spaces definition
|
||||
:return: None
|
||||
"""
|
||||
self.spaces = copy.deepcopy(spaces)
|
||||
|
||||
if self.ap.algorithm.use_accumulated_reward_as_measurement:
|
||||
if 'measurements' in self.spaces.state.sub_spaces:
|
||||
self.spaces.state['measurements'].shape += 1
|
||||
self.spaces.state['measurements'].measurements_names += ['accumulated_reward']
|
||||
else:
|
||||
self.spaces.state['measurements'] = VectorObservationSpace(1, measurements_names=['accumulated_reward'])
|
||||
|
||||
for observation_name in self.spaces.state.sub_spaces.keys():
|
||||
self.spaces.state[observation_name] = \
|
||||
self.pre_network_filter.get_filtered_observation_space(observation_name,
|
||||
self.input_filter.get_filtered_observation_space(observation_name,
|
||||
self.spaces.state[observation_name]))
|
||||
|
||||
self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
|
||||
self.input_filter.get_filtered_reward_space(self.spaces.reward))
|
||||
|
||||
self.spaces.action = self.output_filter.get_unfiltered_action_space(self.spaces.action)
|
||||
|
||||
if isinstance(self.in_action_space, GoalsSpace):
|
||||
# TODO: what if the goal type is an embedding / embedding change?
|
||||
self.spaces.goal = self.in_action_space
|
||||
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
|
||||
|
||||
self.init_environment_dependent_modules()
|
||||
|
||||
def create_networks(self) -> Dict[str, NetworkWrapper]:
|
||||
"""
|
||||
Create all the networks of the agent.
|
||||
The network creation will be done after setting the environment parameters for the agent, since they are needed
|
||||
for creating the network.
|
||||
:return: A list containing all the networks
|
||||
"""
|
||||
networks = {}
|
||||
for network_name in sorted(self.ap.network_wrappers.keys()):
|
||||
networks[network_name] = NetworkWrapper(name=network_name,
|
||||
agent_parameters=self.ap,
|
||||
has_target=self.ap.network_wrappers[network_name].create_target_network,
|
||||
has_global=self.has_global,
|
||||
spaces=self.spaces,
|
||||
replicated_device=self.replicated_device,
|
||||
worker_device=self.worker_device)
|
||||
return networks
|
||||
|
||||
def init_environment_dependent_modules(self) -> None:
|
||||
"""
|
||||
Initialize any modules that depend on knowing information about the environment such as the action space or
|
||||
the observation space
|
||||
:return: None
|
||||
"""
|
||||
# initialize exploration policy
|
||||
self.ap.exploration.action_space = self.spaces.action
|
||||
self.exploration_policy = dynamic_import_and_instantiate_module_from_params(self.ap.exploration)
|
||||
|
||||
# create all the networks of the agent
|
||||
self.networks = self.create_networks()
|
||||
|
||||
@property
|
||||
def phase(self) -> RunPhase:
|
||||
return self._phase
|
||||
|
||||
@phase.setter
|
||||
def phase(self, val: RunPhase) -> None:
|
||||
"""
|
||||
Change the phase of the run for the agent and all the sub components
|
||||
:param val: the new run phase (TRAIN, TEST, etc.)
|
||||
:return: None
|
||||
"""
|
||||
self.reset_evaluation_state(val)
|
||||
self._phase = val
|
||||
self.exploration_policy.change_phase(val)
|
||||
|
||||
def reset_evaluation_state(self, val: RunPhase) -> None:
|
||||
starting_evaluation = (val == RunPhase.TEST)
|
||||
ending_evaluation = (self.phase == RunPhase.TEST)
|
||||
|
||||
if starting_evaluation:
|
||||
self.accumulated_rewards_across_evaluation_episodes = 0
|
||||
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
|
||||
self.num_successes_across_evaluation_episodes = 0
|
||||
self.num_evaluation_episodes_completed = 0
|
||||
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
|
||||
screen.log_title("{}: Starting evaluation phase".format(self.name))
|
||||
|
||||
elif ending_evaluation:
|
||||
# write to the next episode index, since the current episode may have already been written
# to disk, in which case it will not be written again
|
||||
self.agent_logger.set_current_time(self.current_episode + 1)
|
||||
self.agent_logger.create_signal_value(
|
||||
'Evaluation Reward',
|
||||
self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
|
||||
self.agent_logger.create_signal_value(
|
||||
'Shaped Evaluation Reward',
|
||||
self.accumulated_shaped_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
|
||||
success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
|
||||
self.agent_logger.create_signal_value(
|
||||
"Success Rate",
|
||||
success_rate
|
||||
)
|
||||
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
|
||||
screen.log_title("{}: Finished evaluation phase. Success rate = {}"
|
||||
.format(self.name, np.round(success_rate, 2)))
|
||||
|
||||
def call_memory(self, func, args=()):
|
||||
"""
|
||||
This function is a wrapper to allow having the same calls for shared or unshared memories.
|
||||
It should be used instead of calling the memory directly in order to allow different algorithms to work
|
||||
both with a shared and a local memory.
|
||||
:param func: the name of the memory function to call
|
||||
:param args: the arguments to supply to the function
|
||||
:return: the return value of the function
|
||||
"""
|
||||
if self.shared_memory:
|
||||
result = self.shared_memory_scratchpad.internal_call(self.memory_lookup_name, func, args)
|
||||
else:
|
||||
if type(args) != tuple:
|
||||
args = (args,)
|
||||
result = getattr(self.memory, func)(*args)
|
||||
return result
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Name"] = self.full_name_id
|
||||
if self.task_id is not None:
|
||||
log["Worker"] = self.task_id
|
||||
log["Episode"] = self.current_episode
|
||||
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
|
||||
log["Exploration"] = np.round(self.exploration_policy.get_control_param(), 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
log["Training iteration"] = self.training_iteration
|
||||
screen.log_dict(log, prefix=self.phase.value)
|
||||
|
||||
def update_step_in_episode_log(self):
|
||||
"""
|
||||
Updates the in-episode log file with all the per-step signal values.
|
||||
:return: None
|
||||
"""
|
||||
# log all the signals to file
|
||||
self.agent_episode_logger.set_current_time(self.current_episode_steps_counter)
|
||||
self.agent_episode_logger.create_signal_value('Training Iter', self.training_iteration)
|
||||
self.agent_episode_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
|
||||
self.agent_episode_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
|
||||
self.agent_episode_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
|
||||
self.agent_episode_logger.create_signal_value('Total steps', self.total_steps_counter)
|
||||
self.agent_episode_logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
|
||||
self.agent_episode_logger.create_signal_value("Shaped Accumulated Reward", self.total_shaped_reward_in_current_episode)
|
||||
self.agent_episode_logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
self.agent_episode_logger.update_wall_clock_time(self.current_episode_steps_counter)
|
||||
|
||||
for signal in self.step_signals:
|
||||
self.agent_episode_logger.create_signal_value(signal.name, signal.get_last_value())
|
||||
|
||||
# dump
|
||||
self.agent_episode_logger.dump_output_csv()
|
||||
|
||||
def update_log(self):
|
||||
"""
|
||||
Writes logging messages to screen and updates the log file with all the signal values.
|
||||
:return: None
|
||||
"""
|
||||
# log all the signals to file
|
||||
self.agent_logger.set_current_time(self.current_episode)
|
||||
self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
|
||||
self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
|
||||
self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
|
||||
self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
|
||||
self.agent_logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
|
||||
self.agent_logger.create_signal_value('Total steps', self.total_steps_counter)
|
||||
self.agent_logger.create_signal_value("Epsilon", np.mean(self.exploration_policy.get_control_param()))
|
||||
self.agent_logger.create_signal_value("Shaped Training Reward", self.total_shaped_reward_in_current_episode
|
||||
if self._phase == RunPhase.TRAIN else np.nan)
|
||||
self.agent_logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
|
||||
if self._phase == RunPhase.TRAIN else np.nan)
|
||||
|
||||
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
self.agent_logger.update_wall_clock_time(self.current_episode)
|
||||
|
||||
if self._phase != RunPhase.TEST:
|
||||
self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
|
||||
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
|
||||
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
|
||||
|
||||
|
||||
for signal in self.episode_signals:
|
||||
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
|
||||
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
|
||||
self.agent_logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
|
||||
self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
|
||||
|
||||
# dump
|
||||
if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
|
||||
and self.current_episode > 0:
|
||||
self.agent_logger.dump_output_csv()
|
||||
|
||||
def handle_episode_ended(self) -> None:
|
||||
"""
|
||||
End an episode
|
||||
:return: None
|
||||
"""
|
||||
self.current_episode_buffer.is_complete = True
|
||||
|
||||
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
|
||||
self.current_episode += 1
|
||||
|
||||
if self.phase != RunPhase.TEST and isinstance(self.memory, EpisodicExperienceReplay):
|
||||
self.call_memory('store_episode', self.current_episode_buffer)
|
||||
|
||||
if self.phase == RunPhase.TEST:
|
||||
self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
|
||||
self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
|
||||
self.num_evaluation_episodes_completed += 1
|
||||
|
||||
if self.spaces.reward.reward_success_threshold and \
|
||||
self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
|
||||
self.num_successes_across_evaluation_episodes += 1
|
||||
|
||||
if self.ap.visualization.dump_csv:
|
||||
self.update_log()
|
||||
|
||||
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
|
||||
self.log_to_screen()
|
||||
|
||||
def reset_internal_state(self):
|
||||
"""
|
||||
Reset all the episodic parameters
|
||||
:return: None
|
||||
"""
|
||||
for signal in self.episode_signals:
|
||||
signal.reset()
|
||||
for signal in self.step_signals:
|
||||
signal.reset()
|
||||
self.agent_episode_logger.set_episode_idx(self.current_episode)
|
||||
self.total_shaped_reward_in_current_episode = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
self.curr_state = {}
|
||||
self.current_episode_steps_counter = 0
|
||||
self.episode_running_info = {}
|
||||
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
|
||||
if self.exploration_policy:
|
||||
self.exploration_policy.reset()
|
||||
self.input_filter.reset()
|
||||
self.output_filter.reset()
|
||||
self.pre_network_filter.reset()
|
||||
if isinstance(self.memory, EpisodicExperienceReplay):
|
||||
self.call_memory('verify_last_episode_is_closed')
|
||||
|
||||
for network in self.networks.values():
|
||||
network.online_network.reset_internal_memory()
|
||||
|
||||
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
|
||||
"""
|
||||
Given a batch of transitions, calculates their target values and updates the network.
|
||||
:param batch: A list of transitions
|
||||
:return: The total loss of the training, the loss per head and the unclipped gradients
|
||||
"""
|
||||
return 0, [], []
|
||||
|
||||
def _should_update_online_weights_to_target(self):
|
||||
"""
|
||||
Determine if online weights should be copied to the target.
|
||||
:return: boolean: True if the online weights should be copied to the target.
|
||||
"""
|
||||
# update the target network of every network that has a target network
|
||||
step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
|
||||
if step_method.__class__ == TrainingSteps:
|
||||
should_update = (self.training_iteration - self.last_target_network_update_step) >= step_method.num_steps
|
||||
if should_update:
|
||||
self.last_target_network_update_step = self.training_iteration
|
||||
elif step_method.__class__ == EnvironmentSteps:
|
||||
should_update = (self.total_steps_counter - self.last_target_network_update_step) >= step_method.num_steps
|
||||
if should_update:
|
||||
self.last_target_network_update_step = self.total_steps_counter
|
||||
else:
|
||||
raise ValueError("The num_steps_between_copying_online_weights_to_target parameter should be either "
|
||||
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
|
||||
return should_update
|
||||
|
||||
def _should_train(self, wait_for_full_episode=False):
|
||||
"""
|
||||
Determine if we should start a training phase according to the number of steps passed since the last training
|
||||
:return: boolean: True if we should start a training phase
|
||||
"""
|
||||
step_method = self.ap.algorithm.num_consecutive_playing_steps
|
||||
if step_method.__class__ == EnvironmentEpisodes:
|
||||
should_update = (self.current_episode - self.last_training_phase_step) >= step_method.num_steps
|
||||
if should_update:
|
||||
self.last_training_phase_step = self.current_episode
|
||||
elif step_method.__class__ == EnvironmentSteps:
|
||||
should_update = (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
|
||||
if wait_for_full_episode:
|
||||
should_update = should_update and self.current_episode_steps_counter == 0
|
||||
if should_update:
|
||||
self.last_training_phase_step = self.total_steps_counter
|
||||
else:
|
||||
raise ValueError("The num_consecutive_playing_steps parameter should be either "
|
||||
"EnvironmentSteps or Episodes. Instead it is {}".format(step_method.__class__))
|
||||
return should_update
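# Illustrative, runnable sketch of the schedule logic above (standalone, not Coach code):
# with num_consecutive_playing_steps = EnvironmentSteps(4), training is triggered once
# every 4 environment steps.

last_training_phase_step = 0
triggered_at = []
for total_steps_counter in range(1, 13):
    if total_steps_counter - last_training_phase_step >= 4:
        last_training_phase_step = total_steps_counter
        triggered_at.append(total_steps_counter)
assert triggered_at == [4, 8, 12]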
|
||||
|
||||
def train(self):
|
||||
"""
|
||||
Check if a training phase should be done as configured by num_consecutive_playing_steps.
|
||||
If it should, then do several training steps as configured by num_consecutive_training_steps.
|
||||
A single training iteration: Sample a batch, train on it and update target networks.
|
||||
:return: The total training loss during the training iterations.
|
||||
"""
|
||||
loss = 0
|
||||
if self._should_train():
|
||||
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
|
||||
# TODO: this should be network dependent
|
||||
network_parameters = list(self.ap.network_wrappers.values())[0]
|
||||
|
||||
# update counters
|
||||
self.training_iteration += 1
|
||||
|
||||
# sample a batch and train on it
|
||||
batch = self.call_memory('sample', network_parameters.batch_size)
|
||||
if self.pre_network_filter is not None:
|
||||
batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)
|
||||
|
||||
# if the batch returned empty then there are not enough samples in the replay buffer -> skip
|
||||
# training step
|
||||
if len(batch) > 0:
|
||||
# train
|
||||
batch = Batch(batch)
|
||||
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
|
||||
loss += total_loss
|
||||
self.unclipped_grads.add_sample(unclipped_grads)
|
||||
|
||||
# TODO: the learning rate decay should be done through the network instead of here
|
||||
# decay learning rate
|
||||
if network_parameters.learning_rate_decay_rate != 0:
|
||||
self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
|
||||
self.networks['main'].online_network.current_learning_rate))
|
||||
else:
|
||||
self.curr_learning_rate.add_sample(network_parameters.learning_rate)
|
||||
|
||||
if any([network.has_target for network in self.networks.values()]) \
|
||||
and self._should_update_online_weights_to_target():
|
||||
for network in self.networks.values():
|
||||
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
|
||||
|
||||
self.agent_logger.create_signal_value('Update Target Network', 1)
|
||||
else:
|
||||
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
|
||||
self.loss.add_sample(loss)
|
||||
|
||||
if self.imitation:
|
||||
self.log_to_screen()
|
||||
|
||||
# run additional commands after the training is done
|
||||
self.post_training_commands()
|
||||
|
||||
return loss
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
"""
|
||||
Choose an action to act with in the current episode being played. Different behavior might be exhibited when training
or testing.
|
||||
|
||||
:param curr_state: the current state to act upon.
|
||||
:return: chosen action, some action value describing the action (q-value, probability, etc)
|
||||
"""
|
||||
pass
|
||||
|
||||
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
|
||||
network_name: str):
|
||||
"""
|
||||
Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states, stack all
the observations together, all the measurements together, etc.
|
||||
"""
|
||||
# convert to batch so we can run it through the network
|
||||
states = force_list(states)
|
||||
batches_dict = {}
|
||||
for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
|
||||
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
|
||||
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
|
||||
# addition to the current_state, so that all the inputs of the network will be filled)
|
||||
if key in states[0].keys():
|
||||
batches_dict[key] = np.array([np.array(state[key]) for state in states])
|
||||
|
||||
return batches_dict
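# Illustrative, runnable sketch of the stacking performed above (standalone NumPy, not the
# Coach API): a list of per-step state dicts becomes one batched array per embedder key.

import numpy as np

states = [{'observation': np.zeros(4)}, {'observation': np.ones(4)}]
batches_dict = {key: np.array([np.array(state[key]) for state in states]) for key in states[0]}
assert batches_dict['observation'].shape == (2, 4)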
|
||||
|
||||
def act(self) -> ActionInfo:
|
||||
"""
|
||||
Given the agents current knowledge, decide on the next action to apply to the environment
|
||||
:return: an action and a dictionary containing any additional info from the action decision process
|
||||
"""
|
||||
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
|
||||
# This agent never plays while training (e.g. behavioral cloning)
|
||||
return None
|
||||
|
||||
# count steps (only when training or if we are in the evaluation worker)
|
||||
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
|
||||
self.total_steps_counter += 1
|
||||
self.current_episode_steps_counter += 1
|
||||
|
||||
# decide on the action
|
||||
if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
|
||||
# random action
|
||||
self.last_action_info = self.spaces.action.sample_with_info()
|
||||
else:
|
||||
# informed action
|
||||
if self.pre_network_filter is not None:
|
||||
# before choosing an action, first use the pre_network_filter to filter out the current state
|
||||
curr_state = self.run_pre_network_filter_for_inference(self.curr_state)
|
||||
|
||||
else:
|
||||
curr_state = self.curr_state
|
||||
self.last_action_info = self.choose_action(curr_state)
|
||||
|
||||
filtered_action_info = self.output_filter.filter(self.last_action_info)
|
||||
|
||||
return filtered_action_info
|
||||
|
||||
def run_pre_network_filter_for_inference(self, state: StateType):
|
||||
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
|
||||
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
|
||||
|
||||
def get_state_embedding(self, state: dict) -> np.ndarray:
|
||||
"""
|
||||
Given a state, get the corresponding state embedding from the main network
|
||||
:param state: a state dict
|
||||
:return: a numpy embedding vector
|
||||
"""
|
||||
# TODO: this won't work anymore
|
||||
# TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
|
||||
embedding = self.networks['main'].online_network.predict(
|
||||
self.prepare_batch_for_inference(state, "main"),
|
||||
outputs=self.networks['main'].online_network.state_embedding)
|
||||
return embedding
|
||||
|
||||
def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
|
||||
"""
|
||||
Allows agents to update the transition just before adding it to the replay buffer.
|
||||
Can be useful for agents that want to tweak the reward, termination signal, etc.
|
||||
:param transition: the transition to update
|
||||
:return: the updated transition
|
||||
"""
|
||||
return transition
|
||||
|
||||
def observe(self, env_response: EnvResponse) -> bool:
|
||||
"""
|
||||
Given a response from the environment, distill the observation from it and store it for later use.
|
||||
The response should be a dictionary containing the performed action, the new observation and measurements,
|
||||
the reward, a game over flag and any additional information necessary.
|
||||
:param env_response: result of call from environment.step(action)
|
||||
:return:
|
||||
"""
|
||||
|
||||
# filter the env_response
|
||||
filtered_env_response = self.input_filter.filter(env_response)[0]
|
||||
|
||||
# inject agent collected statistics, if required
|
||||
if self.ap.algorithm.use_accumulated_reward_as_measurement:
|
||||
if 'measurements' in filtered_env_response.next_state:
|
||||
filtered_env_response.next_state['measurements'] = np.append(filtered_env_response.next_state['measurements'],
|
||||
self.total_shaped_reward_in_current_episode)
|
||||
else:
|
||||
filtered_env_response.next_state['measurements'] = np.array([self.total_shaped_reward_in_current_episode])
|
||||
|
||||
# if we are in the first step of the episode, then we don't have a next state and a reward and thus no
|
||||
# transition yet, and therefore we don't need to store anything in the memory.
|
||||
# also we did not reach the goal yet.
|
||||
if self.current_episode_steps_counter == 0:
|
||||
# initialize the current state
|
||||
self.curr_state = filtered_env_response.next_state
|
||||
return env_response.game_over
|
||||
else:
|
||||
transition = Transition(state=copy.copy(self.curr_state), action=self.last_action_info.action,
|
||||
reward=filtered_env_response.reward, next_state=filtered_env_response.next_state,
|
||||
game_over=filtered_env_response.game_over, info=filtered_env_response.info)
|
||||
|
||||
# now that we have formed a basic transition - the next state progresses to be the current state
|
||||
self.curr_state = filtered_env_response.next_state
|
||||
|
||||
# make agent specific changes to the transition if needed
|
||||
transition = self.update_transition_before_adding_to_replay_buffer(transition)
|
||||
|
||||
# merge the intrinsic reward in
|
||||
if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
|
||||
transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
|
||||
else:
|
||||
transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward
|
||||
|
||||
# sum up the total shaped reward
|
||||
self.total_shaped_reward_in_current_episode += transition.reward
|
||||
self.total_reward_in_current_episode += env_response.reward
|
||||
self.shaped_reward.add_sample(transition.reward)
|
||||
self.reward.add_sample(env_response.reward)
|
||||
|
||||
# add action info to transition
|
||||
if type(self.parent).__name__ == 'CompositeAgent':
|
||||
transition.add_info(self.parent.last_action_info.__dict__)
|
||||
else:
|
||||
transition.add_info(self.last_action_info.__dict__)
|
||||
|
||||
# create and store the transition
|
||||
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
|
||||
# for episodic memories we keep the transitions in a local buffer until the episode is ended.
|
||||
# for regular memories we insert the transitions directly to the memory
|
||||
if isinstance(self.memory, EpisodicExperienceReplay):
|
||||
self.current_episode_buffer.insert(transition)
|
||||
else:
|
||||
self.call_memory('store', transition)
|
||||
|
||||
if self.ap.visualization.dump_in_episode_signals:
|
||||
self.update_step_in_episode_log()
|
||||
|
||||
return transition.game_over
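# Illustrative sketch (a simplified pseudo-loop, not Coach's actual orchestration) of how
# act() and observe() interleave from the point of view of the environment loop:
#
#     done = agent.observe(env.reset())        # first call only caches the initial state
#     while not done:
#         action_info = agent.act()
#         env_response = env.step(action_info.action)
#         done = agent.observe(env_response)   # builds and stores the Transition
#     agent.handle_episode_ended()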
|
||||
|
||||
def post_training_commands(self):
|
||||
pass
|
||||
|
||||
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
|
||||
"""
|
||||
Get a prediction from the agent with regard to the requested prediction_type.
|
||||
If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
raise a ValueError.
|
||||
:param states:
|
||||
:param prediction_type:
|
||||
:return:
|
||||
"""
|
||||
|
||||
predictions = self.networks['main'].online_network.predict_with_prediction_type(
|
||||
# states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
|
||||
states=states, prediction_type=prediction_type)
|
||||
|
||||
if len(predictions.keys()) != 1:
|
||||
raise ValueError("The network has more than one component {} matching the requested prediction_type {}. ".
|
||||
format(list(predictions.keys()), prediction_type))
|
||||
return list(predictions.values())[0]
|
||||
|
||||
def set_incoming_directive(self, action: ActionType) -> None:
|
||||
if isinstance(self.in_action_space, GoalsSpace):
|
||||
self.current_hrl_goal = action
|
||||
elif isinstance(self.in_action_space, AttentionActionSpace):
|
||||
self.input_filter.observation_filters['attention'].crop_low = action[0]
|
||||
self.input_filter.observation_filters['attention'].crop_high = action[1]
|
||||
self.output_filter.action_filters['masking'].set_masking(action[0], action[1])
|
||||
|
||||
def save_checkpoint(self, checkpoint_id: int) -> None:
|
||||
"""
|
||||
Allows agents to store additional information when saving checkpoints.
|
||||
:param checkpoint_id: the id of the checkpoint
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def sync(self) -> None:
|
||||
"""
|
||||
Sync the global network parameters to local networks
|
||||
:return: None
|
||||
"""
|
||||
for network in self.networks.values():
|
||||
network.sync()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
rl_coach/agents/agent_interface.py (new file, 125 lines)
@@ -0,0 +1,125 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union, List, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType
|
||||
|
||||
|
||||
class AgentInterface(object):
|
||||
def __init__(self):
|
||||
self._phase = RunPhase.HEATUP
|
||||
self._parent = None
|
||||
self.spaces = None
|
||||
|
||||
@property
|
||||
def parent(self):
|
||||
"""
|
||||
Get the parent class of the agent
:return: the agent's parent
|
||||
"""
|
||||
return self._parent
|
||||
|
||||
@parent.setter
|
||||
def parent(self, val):
|
||||
"""
|
||||
Change the parent class of the agent
|
||||
:param val: the new parent
|
||||
:return: None
|
||||
"""
|
||||
self._parent = val
|
||||
|
||||
@property
|
||||
def phase(self) -> RunPhase:
|
||||
"""
|
||||
Get the phase of the agent
|
||||
:return: the current phase
|
||||
"""
|
||||
return self._phase
|
||||
|
||||
@phase.setter
|
||||
def phase(self, val: RunPhase):
|
||||
"""
|
||||
Change the phase of the agent
|
||||
:param val: the new phase
|
||||
:return: None
|
||||
"""
|
||||
self._phase = val
|
||||
|
||||
def reset_internal_state(self) -> None:
|
||||
"""
|
||||
Reset the episode parameters for the agent
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def train(self) -> Union[float, List]:
|
||||
"""
|
||||
Train the agent's network
|
||||
:return: The loss of the training
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def act(self) -> ActionInfo:
|
||||
"""
|
||||
Get a decision of the next action to take.
|
||||
The action is dependent on the current state which the agent holds from resetting the environment or from
|
||||
the observe function.
|
||||
:return: A tuple containing the actual action and additional info on the action
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def observe(self, env_response: EnvResponse) -> bool:
|
||||
"""
|
||||
Gets a response from the environment.
|
||||
Processes this information for later use. For example, create a transition and store it in memory.
|
||||
The action info (a class containing any info the agent wants to store regarding its action decision process) is
|
||||
stored by the agent itself when deciding on the action.
|
||||
:param env_response: an EnvResponse containing the response from the environment
|
||||
:return: a done signal which is based on the agent knowledge. This can be different from the done signal from
|
||||
the environment. For example, an agent can decide to finish the episode each time it gets some
|
||||
intrinsic reward
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def save_checkpoint(self, checkpoint_id: int) -> None:
|
||||
"""
|
||||
Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
|
||||
:param checkpoint_id: the checkpoint id to use for saving
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
|
||||
"""
|
||||
Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
type of prediction_type, or if there is more than one possible way to do so, raise a ValueError.
|
||||
:param states:
|
||||
:param prediction_type:
|
||||
:return: the agent's prediction
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def set_incoming_directive(self, action: ActionType) -> None:
|
||||
"""
|
||||
Pass a higher level command (directive) to the agent.
|
||||
For example, a higher level agent can set the goal of the agent.
|
||||
:param action: the directive to pass to the agent
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
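# Illustrative sketch of the minimal contract a concrete agent must satisfy (a hypothetical
# subclass, shown as comments only; it assumes self.spaces has been set externally):
#
#     class RandomAgent(AgentInterface):
#         def reset_internal_state(self): pass
#         def train(self): return 0.0
#         def act(self): return self.spaces.action.sample_with_info()
#         def observe(self, env_response): return env_response.game_over
#         def save_checkpoint(self, checkpoint_id): pass
#         def get_predictions(self, states, prediction_type): raise NotImplementedError("")
#         def set_incoming_directive(self, action): pass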
|
||||
rl_coach/agents/bc_agent.py (new file, 81 lines)
@@ -0,0 +1,81 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.imitation_agent import ImitationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters, AlgorithmParameters, NetworkParameters, InputEmbedderParameters, \
|
||||
MiddlewareScheme
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class BCAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.collect_new_data = False
|
||||
|
||||
|
||||
class BCNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
|
||||
self.heads_parameters = [PolicyHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
|
||||
|
||||
class BCAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=BCAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": BCNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.bc_agent:BCAgent'
|
||||
|
||||
|
||||
# Behavioral Cloning Agent
|
||||
class BCAgent(ImitationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# When using a policy head, the targets refer to the advantages that we normally feed the head with.
# In this case, we need the policy head to just predict probabilities, so while we usually train the network
# with log(Pi) * Advantages, in this specific case we train it with log(Pi) alone (advantages of all ones),
# so that after the softmax it predicts Pi (= the action probabilities)
|
||||
targets = np.ones(batch.actions().shape[0])
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
|
||||
'output_0_0': batch.actions()},
|
||||
targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
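# Illustrative, runnable sketch (standalone NumPy, not the Coach heads) of why all-ones
# "advantages" reduce the policy-gradient loss to plain negative log-likelihood, i.e.
# behavioral cloning on the demonstrated actions:

import numpy as np

log_pi_of_demonstrated_actions = np.log(np.array([0.7, 0.2, 0.9]))
ones_targets = np.ones_like(log_pi_of_demonstrated_actions)
pg_loss = -np.mean(ones_targets * log_pi_of_demonstrated_actions)
nll_loss = -np.mean(log_pi_of_demonstrated_actions)
assert np.isclose(pg_loss, nll_loss)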
|
||||
|
||||
rl_coach/agents/bootstrapped_dqn_agent.py (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
|
||||
from rl_coach.exploration_policies.bootstrapped import BootstrappedParameters
|
||||
|
||||
|
||||
class BootstrappedDQNNetworkParameters(DQNNetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_output_head_copies = 10
|
||||
self.rescale_gradient_from_head_by_factor = [1.0/self.num_output_head_copies]*self.num_output_head_copies
|
||||
|
||||
|
||||
class BootstrappedDQNAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.network_wrappers = {"main": BootstrappedDQNNetworkParameters()}
|
||||
self.exploration = BootstrappedParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.bootstrapped_dqn_agent:BootstrappedDQNAgent'
|
||||
|
||||
|
||||
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
|
||||
class BootstrappedDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def reset_internal_state(self):
|
||||
super().reset_internal_state()
|
||||
self.exploration_policy.select_head()
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
next_states_online_values = self.networks['main'].online_network.predict(batch.next_states(network_keys))
|
||||
result = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
q_st_plus_1 = result[:self.ap.exploration.architecture_num_q_heads]
|
||||
TD_targets = result[self.ap.exploration.architecture_num_q_heads:]
|
||||
|
||||
# initialize with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
mask = batch[i].info['mask']
|
||||
for head_idx in range(self.ap.exploration.architecture_num_q_heads):
|
||||
if mask[head_idx] == 1:
|
||||
selected_action = np.argmax(next_states_online_values[head_idx][i], 0)
|
||||
TD_targets[head_idx][i, batch.actions()[i]] = \
|
||||
batch.rewards()[i] + (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount \
|
||||
* q_st_plus_1[head_idx][i][selected_action]
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def observe(self, env_response):
|
||||
mask = np.random.binomial(1, self.ap.exploration.bootstrapped_data_sharing_probability,
|
||||
self.ap.exploration.architecture_num_q_heads)
|
||||
env_response.info['mask'] = mask
|
||||
return super().observe(env_response)
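# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): observe() above attaches a Bernoulli mask with
# one bit per Q head to every transition, and learn_from_batch() only rewrites the TD target of heads
# whose bit is 1. The hypothetical helper below mirrors that gating on its own, using the module's
# numpy import.
def _toy_bootstrap_mask(num_heads=10, data_sharing_probability=1.0):
    mask = np.random.binomial(1, data_sharing_probability, num_heads)
    heads_to_update = [head for head in range(num_heads) if mask[head] == 1]
    # only these heads would get their TD target overwritten for this transition
    return mask, heads_to_update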
|
||||
114
rl_coach/agents/categorical_dqn_agent.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.dqn_agent import DQNNetworkParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.categorical_q_head import CategoricalQHeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import StateType
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class CategoricalDQNNetworkParameters(DQNNetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.heads_parameters = [CategoricalQHeadParameters()]
|
||||
|
||||
|
||||
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.v_min = -10.0
|
||||
self.v_max = 10.0
|
||||
self.atoms = 51
|
||||
|
||||
|
||||
class CategoricalDQNExplorationParameters(EGreedyParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
|
||||
self.evaluation_epsilon = 0.001
|
||||
|
||||
|
||||
class CategoricalDQNAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
|
||||
exploration=CategoricalDQNExplorationParameters(),
|
||||
memory=ExperienceReplayParameters(),
|
||||
networks={"main": CategoricalDQNNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'
|
||||
|
||||
|
||||
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
|
||||
class CategoricalDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)
|
||||
|
||||
def distribution_prediction_to_q_values(self, prediction):
|
||||
return np.dot(prediction, self.z_values)
|
||||
|
||||
# prediction's format is (batch, actions, atoms)
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
prediction = self.get_prediction(states)
|
||||
q_values = self.distribution_prediction_to_q_values(prediction)
|
||||
else:
|
||||
q_values = None
|
||||
return q_values
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# for the action we actually took, the error is calculated by the atoms distribution
|
||||
# for all other actions, the error is 0
|
||||
distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)
|
||||
m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
|
||||
|
||||
batches = np.arange(self.ap.network_wrappers['main'].batch_size)
|
||||
for j in range(self.z_values.size):
|
||||
tzj = np.fmax(np.fmin(batch.rewards() +
|
||||
(1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],
|
||||
self.z_values[self.z_values.size - 1]),
|
||||
self.z_values[0])
|
||||
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
|
||||
u = (np.ceil(bj)).astype(int)
|
||||
l = (np.floor(bj)).astype(int)
|
||||
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
|
||||
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
|
||||
# total_loss = cross entropy between actual result above and predicted result for the given action
|
||||
TD_targets[batches, batch.actions()] = m
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
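# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the projection loop in learn_from_batch above,
# rewritten for a single transition. Each target atom value Tz_j = r + discount * (1 - done) * z_j is
# clipped to [v_min, v_max] and its probability mass is split between the two nearest atoms of the
# fixed support. The helper name and arguments are hypothetical; it uses the module's numpy import.
def _project_distribution_single_transition(reward, done, discount, z_values, next_state_probs):
    m = np.zeros_like(z_values)
    delta_z = z_values[1] - z_values[0]
    for j, z_j in enumerate(z_values):
        tzj = np.clip(reward + (1.0 - done) * discount * z_j, z_values[0], z_values[-1])
        bj = (tzj - z_values[0]) / delta_z
        l, u = int(np.floor(bj)), int(np.ceil(bj))
        m[l] += next_state_probs[j] * (u - bj)
        m[u] += next_state_probs[j] * (bj - l)
    return m  # the projected target distribution for the selected action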
|
||||
|
||||
277
rl_coach/agents/clipped_ppo_agent.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from collections import OrderedDict
|
||||
from random import shuffle
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
|
||||
AgentParameters, InputEmbedderParameters
|
||||
from rl_coach.core_types import EnvironmentSteps, Batch, EnvResponse, StateType
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class ClippedPPONetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
|
||||
self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
|
||||
self.loss_weights = [1.0, 1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1, 1]
|
||||
self.batch_size = 64
|
||||
self.optimizer_type = 'Adam'
|
||||
self.clip_gradients = None
|
||||
self.use_separate_networks_per_head = True
|
||||
self.async_training = False
|
||||
self.l2_regularization = 0
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
|
||||
|
||||
|
||||
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_episodes_in_experience_replay = 1000000
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
|
||||
self.gae_lambda = 0.95
|
||||
self.use_kl_regularization = False
|
||||
self.clip_likelihood_ratio_using_epsilon = 0.2
|
||||
self.estimate_state_value_using_gae = True
|
||||
self.step_until_collecting_full_episodes = True
|
||||
self.beta_entropy = 0.01 # should be 0 for mujoco
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
|
||||
self.optimization_epochs = 10
|
||||
self.normalization_stats = None
|
||||
self.clipping_decay_schedule = ConstantSchedule(1)
|
||||
|
||||
|
||||
class ClippedPPOAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
|
||||
exploration=AdditiveNoiseParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": ClippedPPONetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.clipped_ppo_agent:ClippedPPOAgent'
|
||||
|
||||
|
||||
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
|
||||
class ClippedPPOAgent(ActorCriticAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
# signals definition
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
self.policy_loss = self.register_signal('Policy Loss')
|
||||
self.total_kl_divergence_during_training_process = 0.0
|
||||
self.unclipped_grads = self.register_signal('Grads (unclipped)')
|
||||
self.value_targets = self.register_signal('Value Targets')
|
||||
self.kl_divergence = self.register_signal('KL Divergence')
|
||||
self.likelihood_ratio = self.register_signal('Likelihood Ratio')
|
||||
self.clipped_likelihood_ratio = self.register_signal('Clipped Likelihood Ratio')
|
||||
|
||||
|
||||
def set_session(self, sess):
|
||||
super().set_session(sess)
|
||||
if self.ap.algorithm.normalization_stats is not None:
|
||||
self.ap.algorithm.normalization_stats.set_session(sess)
|
||||
|
||||
def fill_advantages(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
current_state_values = self.networks['main'].online_network.predict(batch.states(network_keys))[0]
|
||||
current_state_values = current_state_values.squeeze()
|
||||
self.state_values.add_sample(current_state_values)
|
||||
|
||||
# calculate advantages
|
||||
advantages = []
|
||||
value_targets = []
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
|
||||
advantages = batch.total_returns() - current_state_values
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
|
||||
# get bootstraps
|
||||
episode_start_idx = 0
|
||||
advantages = np.array([])
|
||||
value_targets = np.array([])
|
||||
for idx, game_over in enumerate(batch.game_overs()):
|
||||
if game_over:
|
||||
# get advantages for the rollout
|
||||
value_bootstrapping = np.zeros((1,))
|
||||
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
|
||||
|
||||
rollout_advantages, gae_based_value_targets = \
|
||||
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
|
||||
rollout_state_values)
|
||||
episode_start_idx = idx + 1
|
||||
advantages = np.append(advantages, rollout_advantages)
|
||||
value_targets = np.append(value_targets, gae_based_value_targets)
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
# standardize
|
||||
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
|
||||
|
||||
for transition, advantage, value_target in zip(batch.transitions, advantages, value_targets):
|
||||
transition.info['advantage'] = advantage
|
||||
transition.info['gae_based_value_target'] = value_target
|
||||
|
||||
self.action_advantages.add_sample(advantages)
|
||||
|
||||
def train_network(self, batch, epochs):
|
||||
batch_results = []
|
||||
for j in range(epochs):
|
||||
batch.shuffle()
|
||||
batch_results = {
|
||||
'total_loss': [],
|
||||
'losses': [],
|
||||
'unclipped_grads': [],
|
||||
'kl_divergence': [],
|
||||
'entropy': []
|
||||
}
|
||||
|
||||
fetches = [self.networks['main'].online_network.output_heads[1].kl_divergence,
|
||||
self.networks['main'].online_network.output_heads[1].entropy,
|
||||
self.networks['main'].online_network.output_heads[1].likelihood_ratio,
|
||||
self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
|
||||
|
||||
for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
|
||||
start = i * self.ap.network_wrappers['main'].batch_size
|
||||
end = (i + 1) * self.ap.network_wrappers['main'].batch_size
|
||||
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
actions = batch.actions()[start:end]
|
||||
gae_based_value_targets = batch.info('gae_based_value_target')[start:end]
|
||||
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
# get old policy probabilities and distribution
|
||||
|
||||
# TODO-perf: the target network ("old_policy") does not change between epochs, so its predictions
# could be calculated once for all epochs, and the shuffling should then be performed on indices only.
|
||||
result = self.networks['main'].target_network.predict({k: v[start:end] for k, v in batch.states(network_keys).items()})
|
||||
old_policy_distribution = result[1:]
|
||||
|
||||
# calculate gradients and apply on both the local policy network and on the global policy network
|
||||
if self.ap.algorithm.estimate_state_value_using_gae:
|
||||
value_targets = np.expand_dims(gae_based_value_targets, -1)
|
||||
else:
|
||||
value_targets = batch.total_returns(expand_dims=True)[start:end]
|
||||
|
||||
inputs = copy.copy({k: v[start:end] for k, v in batch.states(network_keys).items()})
|
||||
inputs['output_1_0'] = actions
|
||||
|
||||
# The old_policy_distribution needs to be represented as a list, because in the case of
# discrete controls it holds just a mean, while otherwise it holds both a mean and a standard deviation
|
||||
for input_index, input in enumerate(old_policy_distribution):
|
||||
inputs['output_1_{}'.format(input_index + 1)] = input
|
||||
|
||||
inputs['output_1_3'] = self.ap.algorithm.clipping_decay_schedule.current_value
|
||||
|
||||
total_loss, losses, unclipped_grads, fetch_result = \
|
||||
self.networks['main'].train_and_sync_networks(
|
||||
inputs, [value_targets, batch.info('advantage')[start:end]], additional_fetches=fetches
|
||||
)
|
||||
|
||||
batch_results['total_loss'].append(total_loss)
|
||||
batch_results['losses'].append(losses)
|
||||
batch_results['unclipped_grads'].append(unclipped_grads)
|
||||
batch_results['kl_divergence'].append(fetch_result[0])
|
||||
batch_results['entropy'].append(fetch_result[1])
|
||||
|
||||
self.unclipped_grads.add_sample(unclipped_grads)
|
||||
self.value_targets.add_sample(value_targets)
|
||||
self.likelihood_ratio.add_sample(fetch_result[2])
|
||||
self.clipped_likelihood_ratio.add_sample(fetch_result[3])
|
||||
|
||||
for key in batch_results.keys():
|
||||
batch_results[key] = np.mean(batch_results[key], 0)
|
||||
|
||||
self.value_loss.add_sample(batch_results['losses'][0])
|
||||
self.policy_loss.add_sample(batch_results['losses'][1])
|
||||
|
||||
if self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
|
||||
curr_learning_rate = self.networks['main'].online_network.get_variable_value(
|
||||
self.networks['main'].online_network.adaptive_learning_rate_scheme)
|
||||
self.curr_learning_rate.add_sample(curr_learning_rate)
|
||||
else:
|
||||
curr_learning_rate = self.ap.network_wrappers['main'].learning_rate
|
||||
|
||||
# log training parameters
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
("Surrogate loss", batch_results['losses'][1]),
|
||||
("KL divergence", batch_results['kl_divergence']),
|
||||
("Entropy", batch_results['entropy']),
|
||||
("training epoch", j),
|
||||
("learning_rate", curr_learning_rate)
|
||||
]),
|
||||
prefix="Policy training"
|
||||
)
|
||||
|
||||
self.total_kl_divergence_during_training_process = batch_results['kl_divergence']
|
||||
self.entropy.add_sample(batch_results['entropy'])
|
||||
self.kl_divergence.add_sample(batch_results['kl_divergence'])
|
||||
return batch_results['losses']
|
||||
|
||||
def post_training_commands(self):
|
||||
# clean memory
|
||||
self.call_memory('clean')
|
||||
|
||||
def train(self):
|
||||
if self._should_train(wait_for_full_episode=True):
|
||||
dataset = self.memory.transitions
|
||||
dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
|
||||
batch = Batch(dataset)
|
||||
|
||||
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
|
||||
self.networks['main'].sync()
|
||||
self.fill_advantages(batch)
|
||||
|
||||
# take only the requested number of steps
|
||||
if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
|
||||
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
|
||||
shuffle(dataset)
|
||||
batch = Batch(dataset)
|
||||
|
||||
self.train_network(batch, self.ap.algorithm.optimization_epochs)
|
||||
|
||||
self.post_training_commands()
|
||||
self.training_iteration += 1
|
||||
# self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
|
||||
return None
|
||||
|
||||
def run_pre_network_filter_for_inference(self, state: StateType):
|
||||
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
|
||||
return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
self.ap.algorithm.clipping_decay_schedule.step()
|
||||
return super().choose_action(curr_state)
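# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the clipped surrogate objective that the PPO
# head trained in train_network() above optimizes, written out for a batch of log-probabilities and
# advantages. The helper is hypothetical and relies on the module's numpy import.
def _clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, clip_epsilon=0.2):
    ratio = np.exp(new_log_probs - old_log_probs)
    clipped_ratio = np.clip(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)
    # maximize the pessimistic bound of the two surrogates, i.e. minimize its negation
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))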
|
||||
415
rl_coach/agents/composite_agent.py
Normal file
@@ -0,0 +1,415 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
from enum import Enum
|
||||
from typing import Union, List, Dict
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.agent_interface import AgentInterface
|
||||
from rl_coach.base_parameters import AgentParameters, VisualizationParameters
|
||||
# from rl_coach.environments.environment_interface import ActionSpace
|
||||
from rl_coach.spaces import ActionSpace
|
||||
from rl_coach.spaces import AgentSelection, AttentionActionSpace, ObservationSpace, SpacesDefinition
|
||||
from rl_coach.utils import short_dynamic_import
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvResponse, ActionType, RunPhase
|
||||
from rl_coach.filters.observation.observation_crop_filter import ObservationCropFilter
|
||||
|
||||
|
||||
class DecisionPolicy(object):
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, decide on a single action to take.
|
||||
:param actions_info: a dictionary of agent names and their corresponding
ActionInfo instances containing information for each agent's action
|
||||
:return: a single action and the corresponding action info
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
|
||||
class SingleDecider(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action according to the agent that is currently in control.
|
||||
"""
|
||||
def __init__(self, default_decision_maker: str):
|
||||
super().__init__()
|
||||
self._decision_maker = default_decision_maker
|
||||
|
||||
@property
|
||||
def decision_maker(self):
|
||||
"""
|
||||
Get the decision maker that was set by the upper level control.
|
||||
"""
|
||||
return self._decision_maker
|
||||
|
||||
@decision_maker.setter
|
||||
def decision_maker(self, decision_maker: str):
|
||||
"""
|
||||
Set the decision maker by the upper level control.
|
||||
:param action: the incoming action from the upper level control.
|
||||
"""
|
||||
self._decision_maker = decision_maker
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action of the current decision maker
|
||||
:param actions_info: a dictionary mapping agent names to ActionInfo instances describing each agent's action
|
||||
:return: a single action
|
||||
"""
|
||||
if self.decision_maker not in actions_info.keys():
|
||||
raise ValueError("The current decision maker ({}) does not exist in the given actions ({})"
|
||||
.format(self.decision_maker, actions_info.keys()))
|
||||
return actions_info[self.decision_maker]
|
||||
|
||||
|
||||
class RoundRobin(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action according to agents selected in a circular order.
|
||||
"""
|
||||
def __init__(self, num_agents: int):
|
||||
super().__init__()
|
||||
self.round_robin = itertools.cycle(range(num_agents))
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action of the current decision maker, which is set in a
|
||||
circular order
|
||||
:param actions_info: a dictionary mapping agent names to ActionInfo instances describing each agent's action
|
||||
:return: a single action
|
||||
"""
|
||||
decision_maker = self.round_robin.__next__()
|
||||
if decision_maker not in range(len(actions_info.keys())):
|
||||
raise ValueError("The size of action_info does not match the number of agents set to RoundRobin decision"
|
||||
" policy.")
|
||||
return list(actions_info.values())[decision_maker]  # dict views are not indexable, so convert to a list first
|
||||
|
||||
|
||||
class MajorityVote(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action that most of the agents chose.
|
||||
This policy is only useful for discrete control.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action that most agents agree on
|
||||
:param actions_info: a dictionary mapping agent names to ActionInfo instances describing each agent's action
|
||||
:return: a single action
|
||||
"""
|
||||
# TODO: enforce discrete action spaces
|
||||
if len(actions_info.keys()) == 0:
|
||||
raise ValueError("The given list of actions is empty")
|
||||
vote_count = np.bincount([action_info.action for action_info in actions_info.values()])
|
||||
majority_vote = np.argmax(vote_count)
|
||||
return ActionInfo(action=majority_vote)  # return the action that received the most votes
|
||||
|
||||
|
||||
class MeanDecision(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that takes the mean action given the actions of all the agents.
|
||||
This policy is only useful for continuous control.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the mean action
|
||||
:param actions_info: a dictionary mapping agent names to ActionInfo instances describing each agent's action
|
||||
:return: a single action
|
||||
"""
|
||||
# TODO: enforce continuous action spaces
|
||||
if len(actions_info.keys()) == 0:
|
||||
raise ValueError("The given list of actions is empty")
|
||||
mean = np.mean([action_info.action for action_info in actions_info.values()], axis=0)
|
||||
return ActionInfo(mean)
|
||||
|
||||
|
||||
class RewardPolicy(Enum):
|
||||
ReachingGoal = 0
|
||||
NativeEnvironmentReward = 1
|
||||
AccumulatedEnvironmentRewards = 2
|
||||
|
||||
|
||||
class CompositeAgent(AgentInterface):
|
||||
"""
|
||||
A CompositeAgent is a group of agents in the same hierarchy level.
|
||||
In a CompositeAgent, each agent may take the role of either a controller or an observer.
|
||||
Each agent that is defined as observer, gets observations from the environment.
|
||||
Each agent that is defined as controller, can potentially also control the environment, in addition to observing it.
|
||||
There are several ways to decide on the action from different controller agents:
|
||||
1. Ensemble -
|
||||
- Take the majority vote (discrete controls)
|
||||
- Take the mean action (continuous controls)
|
||||
- Round robin between the agents (discrete/continuous)
|
||||
2. Skills -
|
||||
- At each step a single agent decides (chosen by the upper hierarchy controlling agent)
|
||||
|
||||
A CompositeAgent can be controlled using one of the following methods (ActionSpaces):
|
||||
1. Goals (in terms of measurements, observation, embedding or a change in those values)
|
||||
2. Agent Selection (skills) / Discrete action space.
|
||||
3. Attention (a subset of the real environment observation / action space)
|
||||
"""
|
||||
def __init__(self,
|
||||
agents_parameters: Union[AgentParameters, Dict[str, AgentParameters]],
|
||||
visualization_parameters: VisualizationParameters,
|
||||
decision_policy: DecisionPolicy,
|
||||
out_action_space: ActionSpace,
|
||||
in_action_space: Union[None, ActionSpace]=None,
|
||||
decision_makers: Union[bool, Dict[str, bool]]=True,
|
||||
reward_policy: RewardPolicy=RewardPolicy.NativeEnvironmentReward,
|
||||
name="CompositeAgent"):
|
||||
"""
|
||||
Construct an agent group
|
||||
:param agents_parameters: a list of presets describing each one of the agents in the group
|
||||
:param decision_policy: the decision policy of the group which describes how actions are consolidated
|
||||
:param out_action_space: the type of action space that is used by this composite agent in order to control the
|
||||
underlying environment
|
||||
:param in_action_space: the type of action space that is used by the upper level agent in order to control this
|
||||
group
|
||||
:param decision_makers: a list of booleans representing for each corresponding agent if it has a decision
|
||||
privilege or if it is just an observer
|
||||
:param reward_policy: the type of the reward that the group receives
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
if isinstance(agents_parameters, AgentParameters):
|
||||
decision_makers = {agents_parameters.name: True}
|
||||
agents_parameters = {agents_parameters.name: agents_parameters}
|
||||
self.agents_parameters = agents_parameters
|
||||
self.visualization_parameters = visualization_parameters
|
||||
self.decision_makers = decision_makers
|
||||
self.decision_policy = decision_policy
|
||||
self.in_action_space = in_action_space
|
||||
self.out_action_space = out_action_space # TODO: this is not being used
|
||||
self.reward_policy = reward_policy
|
||||
self.full_name_id = self.name = name
|
||||
self.current_decision_maker = 0
|
||||
self.environment = None
|
||||
self.agents = {} # key = agent_name, value = agent
|
||||
self.incoming_action = None
|
||||
self.last_state = None
|
||||
self._phase = RunPhase.HEATUP
|
||||
self.last_action_info = None
|
||||
self.current_episode = 0
|
||||
self.parent_level_manager = None
|
||||
|
||||
# environment spaces
|
||||
self.spaces = None
|
||||
|
||||
# counters for logging
|
||||
self.total_steps_counter = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
|
||||
# validate input
|
||||
if set(self.decision_makers) != set(self.agents_parameters):
|
||||
raise ValueError("The decision_makers dictionary keys does not match the names of the given agents")
|
||||
if sum(self.decision_makers.values()) > 1 and type(self.decision_policy) == SingleDecider \
|
||||
and type(self.in_action_space) != AgentSelection:
|
||||
raise ValueError("When the control policy is set to single decider, the master policy should control the"
|
||||
"agent group via agent selection (ControlType.AgentSelection)")
|
||||
|
||||
@property
|
||||
def parent(self):
|
||||
"""
|
||||
Get the parent class of the composite agent
|
||||
:return: the parent of the composite agent
|
||||
"""
|
||||
return self._parent
|
||||
|
||||
@parent.setter
|
||||
def parent(self, val):
|
||||
"""
|
||||
Change the parent class of the composite agent.
|
||||
Additionally, updates the full name of the agent
|
||||
:param val: the new parent
|
||||
:return: None
|
||||
"""
|
||||
self._parent = val
|
||||
if not hasattr(self._parent, 'name'):
|
||||
raise ValueError("The parent of a composite agent must have a name")
|
||||
self.full_name_id = "{}/{}".format(self._parent.name, self.name)
|
||||
|
||||
def create_agents(self):
|
||||
for agent_name, agent_parameters in self.agents_parameters.items():
|
||||
agent_parameters.name = agent_name
|
||||
|
||||
# create agent
|
||||
self.agents[agent_parameters.name] = short_dynamic_import(agent_parameters.path)(agent_parameters,
|
||||
parent=self)
|
||||
self.agents[agent_parameters.name].parent_level_manager = self.parent_level_manager
|
||||
|
||||
# TODO: this is a bit too specific to be defined here
|
||||
# add an attention cropping filter if the incoming directives are attention boxes
|
||||
if isinstance(self.in_action_space, AttentionActionSpace):
|
||||
attention_size = self.in_action_space.forced_attention_size
|
||||
for agent in self.agents.values():
|
||||
agent.input_filter.observation_filters['attention'] = \
|
||||
ObservationCropFilter(crop_low=np.zeros_like(attention_size), crop_high=attention_size)
|
||||
agent.input_filter.observation_filters.move_to_end('attention', last=False) # add the cropping at the beginning
|
||||
|
||||
def setup_logger(self) -> None:
|
||||
"""
|
||||
Setup the logger for all the agents in the composite agent
|
||||
:return: None
|
||||
"""
|
||||
[agent.setup_logger() for agent in self.agents.values()]
|
||||
|
||||
def set_session(self, sess) -> None:
|
||||
"""
|
||||
Set the deep learning framework session for all the agents in the composite agent
|
||||
:return: None
|
||||
"""
|
||||
[agent.set_session(sess) for agent in self.agents.values()]
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
"""
|
||||
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
|
||||
dependent on those values, by calling init_environment_dependent_modules
|
||||
:param spaces: the definitions of all the spaces of the environment
|
||||
:return: None
|
||||
"""
|
||||
self.spaces = copy.deepcopy(spaces)
|
||||
[agent.set_environment_parameters(self.spaces) for agent in self.agents.values()]
|
||||
|
||||
@property
|
||||
def phase(self):
|
||||
return self._phase
|
||||
|
||||
@phase.setter
|
||||
def phase(self, val: RunPhase) -> None:
|
||||
"""
|
||||
Change the current phase of all the agents in the group
|
||||
:param val: the new phase
|
||||
:return: None
|
||||
"""
|
||||
self._phase = val
|
||||
for agent in self.agents.values():
|
||||
agent.phase = val
|
||||
|
||||
def end_episode(self) -> None:
|
||||
"""
|
||||
End an episode
|
||||
:return: None
|
||||
"""
|
||||
self.current_episode += 1
|
||||
[agent.handle_episode_ended() for agent in self.agents.values()]
|
||||
|
||||
def reset_internal_state(self) -> None:
|
||||
"""
|
||||
Reset the episode for all the agents in the group
|
||||
:return: None
|
||||
"""
|
||||
# update counters
|
||||
self.total_steps_counter = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
|
||||
# reset all sub modules
|
||||
[agent.reset_internal_state() for agent in self.agents.values()]
|
||||
|
||||
def train(self) -> Union[float, List]:
|
||||
"""
|
||||
Make a single training step for all the agents of the group
|
||||
:return: a list of loss values from the training step
|
||||
"""
|
||||
return [agent.train() for agent in self.agents.values()]
|
||||
|
||||
def act(self) -> ActionInfo:
|
||||
"""
|
||||
Get the actions from all the agents in the group. Then use the decision policy in order to
|
||||
extract a single action out of the list of actions.
|
||||
:return: the chosen action and its corresponding information
|
||||
"""
|
||||
|
||||
# update counters
|
||||
self.total_steps_counter += 1
|
||||
self.current_episode_steps_counter += 1
|
||||
|
||||
# get the actions info from all the agents
|
||||
actions_info = {}
|
||||
for agent_name, agent in self.agents.items():
|
||||
action_info = agent.act()
|
||||
actions_info[agent_name] = action_info
|
||||
|
||||
# decide on a single action to apply to the environment
|
||||
action_info = self.decision_policy.choose_action(actions_info)
|
||||
|
||||
# TODO: make the last action info a property?
|
||||
# pass the action info to all the observers
|
||||
for agent_name, is_decision_maker in self.decision_makers.items():
|
||||
if not is_decision_maker:
|
||||
self.agents[agent_name].last_action_info = action_info
|
||||
self.last_action_info = action_info
|
||||
|
||||
return self.last_action_info
|
||||
|
||||
def observe(self, env_response: EnvResponse) -> bool:
|
||||
"""
|
||||
Given a response from the environment as a env_response, filter it and pass it to the agents.
|
||||
This method has two main jobs:
|
||||
1. Wrap the previous transition, ending with the new observation coming from EnvResponse.
|
||||
2. Save the next_state as the current_state to take action upon for the next call to act().
|
||||
|
||||
:param env_response: the response coming from the environment (next observation, reward and game over flag)
:return: whether the episode should end
|
||||
"""
|
||||
|
||||
# accumulate the unfiltered rewards for visualization
|
||||
self.total_reward_in_current_episode += env_response.reward
|
||||
|
||||
episode_ended = env_response.game_over
|
||||
|
||||
# pass the env_response to all the sub-agents
|
||||
# TODO: what if one agent decides to end the episode but the others don't? who decides?
|
||||
for agent_name, agent in self.agents.items():
|
||||
goal_reached = agent.observe(env_response)
|
||||
episode_ended = episode_ended or goal_reached
|
||||
|
||||
# TODO: unlike for a single agent, here we also treat a game over by the environment.
|
||||
# probably better to only return the agents' goal_reached decisions.
|
||||
return episode_ended
|
||||
|
||||
def save_checkpoint(self, checkpoint_id: int) -> None:
|
||||
[agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]
|
||||
|
||||
def set_incoming_directive(self, action: ActionType) -> None:
|
||||
self.incoming_action = action
|
||||
if isinstance(self.decision_policy, SingleDecider) and isinstance(self.in_action_space, AgentSelection):
|
||||
self.decision_policy.decision_maker = list(self.agents.keys())[action]
|
||||
if isinstance(self.in_action_space, AttentionActionSpace):
|
||||
# TODO: redesign to be more modular
|
||||
for agent in self.agents.values():
|
||||
agent.input_filter.observation_filters['attention'].crop_low = action[0]
|
||||
agent.input_filter.observation_filters['attention'].crop_high = action[1]
|
||||
agent.output_filter.action_filters['masking'].set_masking(action[0], action[1])
|
||||
|
||||
# TODO rethink this scheme. we don't want so many if else clauses lying around here.
|
||||
# TODO - for incoming actions which do not involve setting the acting agent we should change the
|
||||
# observation_space, goal to pursue, etc accordingly to the incoming action.
|
||||
|
||||
def sync(self) -> None:
|
||||
"""
|
||||
Sync the agent networks with the global network
|
||||
:return:
|
||||
"""
|
||||
[agent.sync() for agent in self.agents.values()]
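# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how a decision policy consolidates the actions
# proposed by several controller agents. The agent names and action values below are made up purely
# for illustration; it relies on the ActionInfo, MeanDecision and numpy names defined/imported above.
def _toy_mean_decision_example():
    actions_info = {'agent_a': ActionInfo(np.array([0.2])),
                    'agent_b': ActionInfo(np.array([0.4])),
                    'agent_c': ActionInfo(np.array([0.6]))}
    # MeanDecision averages the continuous actions -> array([0.4])
    return MeanDecision().choose_action(actions_info).action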
|
||||
192
rl_coach/agents/ddpg_agent.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
|
||||
AgentParameters, InputEmbedderParameters, EmbedderScheme
|
||||
from rl_coach.exploration_policies.ou_process import OUProcessParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.spaces import BoxActionSpace, GoalsSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.ddpg_actor_head import DDPGActorHeadParameters
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps
|
||||
|
||||
|
||||
class DDPGCriticNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True),
|
||||
'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [VHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 64
|
||||
self.async_training = False
|
||||
self.learning_rate = 0.001
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
|
||||
|
||||
|
||||
class DDPGActorNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True)}
|
||||
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
|
||||
self.heads_parameters = [DDPGActorHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 64
|
||||
self.async_training = False
|
||||
self.learning_rate = 0.0001
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
|
||||
|
||||
|
||||
class DDPGAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
|
||||
self.rate_for_copying_weights_to_target = 0.001
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(1)
|
||||
self.use_target_network_for_evaluation = False
|
||||
self.action_penalty = 0
|
||||
self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
|
||||
self.use_non_zero_discount_for_terminal_states = False
|
||||
|
||||
|
||||
class DDPGAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DDPGAlgorithmParameters(),
|
||||
exploration=OUProcessParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"actor": DDPGActorNetworkParameters(),
|
||||
"critic": DDPGCriticNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.ddpg_agent:DDPGAgent'
|
||||
|
||||
|
||||
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
|
||||
class DDPGAgent(ActorCriticAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.q_values = self.register_signal("Q")
|
||||
self.TD_targets_signal = self.register_signal("TD targets")
|
||||
self.action_signal = self.register_signal("actions")
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
actor = self.networks['actor']
|
||||
critic = self.networks['critic']
|
||||
|
||||
actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
|
||||
critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
|
||||
|
||||
# TD error = r + discount*max(q_st_plus_1) - q_st
|
||||
next_actions, actions_mean = actor.parallel_prediction([
|
||||
(actor.target_network, batch.next_states(actor_keys)),
|
||||
(actor.online_network, batch.states(actor_keys))
|
||||
])
|
||||
|
||||
critic_inputs = copy.copy(batch.next_states(critic_keys))
|
||||
critic_inputs['action'] = next_actions
|
||||
q_st_plus_1 = critic.target_network.predict(critic_inputs)
|
||||
|
||||
# calculate the bootstrapped TD targets while discounting terminal states according to
|
||||
# use_non_zero_discount_for_terminal_states
|
||||
if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
|
||||
TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
|
||||
else:
|
||||
TD_targets = batch.rewards(expand_dims=True) + \
|
||||
(1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1
|
||||
|
||||
# clip the TD targets to prevent overestimation errors
|
||||
if self.ap.algorithm.clip_critic_targets:
|
||||
TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)
|
||||
|
||||
self.TD_targets_signal.add_sample(TD_targets)
|
||||
|
||||
# get the gradients of the critic output with respect to the action
|
||||
critic_inputs = copy.copy(batch.states(critic_keys))
|
||||
critic_inputs['action'] = actions_mean
|
||||
action_gradients = critic.online_network.predict(critic_inputs,
|
||||
outputs=critic.online_network.gradients_wrt_inputs[0]['action'])
|
||||
|
||||
# train the critic
|
||||
critic_inputs = copy.copy(batch.states(critic_keys))
|
||||
critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
|
||||
result = critic.train_and_sync_networks(critic_inputs, TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
# apply the gradients from the critic to the actor
|
||||
initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
|
||||
gradients = actor.online_network.predict(batch.states(actor_keys),
|
||||
outputs=actor.online_network.weighted_gradients[0],
|
||||
initial_feed_dict=initial_feed_dict)
|
||||
|
||||
if actor.has_global:
|
||||
actor.apply_gradients_to_global_network(gradients)
|
||||
actor.update_online_network()
|
||||
else:
|
||||
actor.apply_gradients_to_online_network(gradients)
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def train(self):
|
||||
return Agent.train(self)
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if not (isinstance(self.spaces.action, BoxActionSpace) or isinstance(self.spaces.action, GoalsSpace)):
|
||||
raise ValueError("DDPG works only for continuous control problems")
|
||||
# convert to batch so we can run it through the network
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
|
||||
if self.ap.algorithm.use_target_network_for_evaluation:
|
||||
actor_network = self.networks['actor'].target_network
|
||||
else:
|
||||
actor_network = self.networks['actor'].online_network
|
||||
|
||||
action_values = actor_network.predict(tf_input_state).squeeze()
|
||||
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
self.action_signal.add_sample(action)
|
||||
|
||||
# get q value
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'critic')
|
||||
action_batch = np.expand_dims(action, 0)
|
||||
if type(action) != np.ndarray:
|
||||
action_batch = np.array([[action]])
|
||||
tf_input_state['action'] = action_batch
|
||||
q_value = self.networks['critic'].online_network.predict(tf_input_state)[0]
|
||||
self.q_values.add_sample(q_value)
|
||||
|
||||
action_info = ActionInfo(action=action,
|
||||
action_value=q_value)
|
||||
|
||||
return action_info
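# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the deterministic policy gradient applied in
# learn_from_batch above. The actor ascends grad_theta Q(s, mu(s)) = dQ/da * dmu/dtheta, which is why
# the critic's action gradients are fed (negated, since optimizers minimize) into the actor's weighted
# gradients op. The scalar toy below is hypothetical.
def _toy_deterministic_policy_gradient_step(theta, state, learning_rate=0.001):
    # toy linear actor mu(s) = theta * s and toy critic Q(s, a) = -(a - s) ** 2
    action = theta * state
    dq_da = -2.0 * (action - state)   # critic gradient with respect to the action
    dmu_dtheta = state                # actor gradient with respect to its parameter
    return theta + learning_rate * dq_da * dmu_dtheta  # gradient ascent step on Q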
|
||||
69
rl_coach/agents/ddqn_agent.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
|
||||
|
||||
class DDQNAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
|
||||
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
|
||||
self.exploration.evaluation_epsilon = 0.001
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.ddqn_agent:DDQNAgent'
|
||||
|
||||
|
||||
# Double DQN - https://arxiv.org/abs/1509.06461
|
||||
class DDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# initialize with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
|
||||
TD_errors = []
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
new_target = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
|
||||
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
|
||||
TD_targets[i, batch.actions()[i]] = new_target
|
||||
|
||||
# update errors in prioritized replay buffer
|
||||
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
|
||||
importance_weights=importance_weights)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
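# --------------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the Double DQN target built in learn_from_batch
# above, written for a single transition. The online network selects the greedy action and the target
# network evaluates it, which reduces the overestimation bias of vanilla DQN. The helper name and
# arguments are hypothetical; it relies on the module's numpy import.
def _toy_double_dqn_target(reward, game_over, discount, q_next_online, q_next_target):
    selected_action = np.argmax(q_next_online)
    return reward + (1.0 - game_over) * discount * q_next_target[selected_action]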
|
||||
219
rl_coach/agents/dfp_agent.py
Normal file
@@ -0,0 +1,219 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from enum import Enum
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
|
||||
from rl_coach.architectures.tensorflow_components.heads.measurements_prediction_head import MeasurementsPredictionHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
|
||||
InputEmbedderParameters, MiddlewareScheme
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class HandlingTargetsAfterEpisodeEnd(Enum):
|
||||
LastStep = 0
|
||||
NAN = 1
|
||||
|
||||
|
||||
class DFPNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
|
||||
'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
|
||||
'goal': InputEmbedderParameters(activation_function='leaky_relu')}
|
||||
|
||||
self.input_embedders_parameters['observation'].scheme = [
|
||||
Conv2d([32, 8, 4]),
|
||||
Conv2d([64, 4, 2]),
|
||||
Conv2d([64, 3, 1]),
|
||||
Dense([512]),
|
||||
]
|
||||
|
||||
self.input_embedders_parameters['measurements'].scheme = [
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
]
|
||||
|
||||
self.input_embedders_parameters['goal'].scheme = [
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
]
|
||||
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
|
||||
scheme=MiddlewareScheme.Empty)
|
||||
self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = False
|
||||
self.batch_size = 64
|
||||
self.adam_optimizer_beta1 = 0.95
|
||||
|
||||
|
||||
class DFPMemoryParameters(EpisodicExperienceReplayParameters):
|
||||
def __init__(self):
|
||||
self.max_size = (MemoryGranularity.Transitions, 20000)
|
||||
self.shared_memory = True
|
||||
super().__init__()
|
||||
|
||||
|
||||
class DFPAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_predicted_steps_ahead = 6
|
||||
self.goal_vector = [1.0, 1.0]
|
||||
self.future_measurements_weights = [0.5, 0.5, 1.0]
|
||||
self.use_accumulated_reward_as_measurement = False
|
||||
self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
|
||||
self.scale_measurements_targets = {}
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(8)
|
||||
|
||||
|
||||
class DFPAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DFPAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=DFPMemoryParameters(),
|
||||
networks={"main": DFPNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.dfp_agent:DFPAgent'
|
||||
|
||||
|
||||
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
|
||||
class DFPAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.current_goal = self.ap.algorithm.goal_vector
|
||||
self.target_measurements_scale_factors = None
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
network_inputs = batch.states(network_keys)
|
||||
network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
|
||||
self.ap.network_wrappers['main'].batch_size, axis=0)
|
||||
|
||||
# get the current outputs of the network
|
||||
targets = self.networks['main'].online_network.predict(network_inputs)
|
||||
|
||||
# change the targets for the taken actions
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
# predict the future measurements
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
|
||||
tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
|
||||
measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
|
||||
action_values = np.zeros(len(self.spaces.action.actions))
|
||||
num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)
|
||||
|
||||
            # calculate the score of each action by multiplying its predicted future measurements with the goal vector
|
||||
for action_idx in range(len(self.spaces.action.actions)):
|
||||
action_measurements = measurements_future_prediction[action_idx]
|
||||
action_measurements = np.reshape(action_measurements,
|
||||
(self.ap.algorithm.num_predicted_steps_ahead,
|
||||
self.spaces.state['measurements'].shape[0]))
|
||||
future_steps_values = np.dot(action_measurements, self.current_goal)
|
||||
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
|
||||
self.ap.algorithm.future_measurements_weights)
|
||||
else:
|
||||
action_values = None
|
||||
|
||||
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
if action_values is not None:
|
||||
action_values = action_values.squeeze()
|
||||
action_info = ActionInfo(action=action, action_value=action_values[action])
|
||||
else:
|
||||
action_info = ActionInfo(action=action)
|
||||
|
||||
return action_info
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
self.spaces = copy.deepcopy(spaces)
|
||||
self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
|
||||
measurements_names=
|
||||
self.spaces.state['measurements'].measurements_names)
|
||||
|
||||
        # if the user has provided scale values, check that the measurement names are valid
|
||||
if set(self.spaces.state['measurements'].measurements_names).intersection(
|
||||
self.ap.algorithm.scale_measurements_targets.keys()) !=\
|
||||
set(self.ap.algorithm.scale_measurements_targets.keys()):
|
||||
raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
|
||||
"the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
|
||||
self.spaces.state['measurements'].measurements_names))
|
||||
|
||||
super().set_environment_parameters(self.spaces)
|
||||
|
||||
# the below is done after calling the base class method, as it might add accumulated reward as a measurement
|
||||
|
||||
# fill out the missing measurements scale factors
|
||||
for measurement_name in self.spaces.state['measurements'].measurements_names:
|
||||
if measurement_name not in self.ap.algorithm.scale_measurements_targets:
|
||||
self.ap.algorithm.scale_measurements_targets[measurement_name] = 1
|
||||
|
||||
self.target_measurements_scale_factors = \
|
||||
np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
|
||||
self.spaces.state['measurements'].measurements_names])
|
||||
|
||||
def handle_episode_ended(self):
|
||||
last_episode = self.current_episode_buffer
|
||||
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
|
||||
self._update_measurements_targets(last_episode,
|
||||
self.ap.algorithm.num_predicted_steps_ahead)
|
||||
super().handle_episode_ended()
|
||||
|
||||
def _update_measurements_targets(self, episode, num_steps):
|
||||
if 'measurements' not in episode.transitions[0].state or episode.transitions[0].state['measurements'] == []:
|
||||
raise ValueError("Measurements are not present in the transitions of the last episode played. ")
|
||||
measurements_size = self.spaces.state['measurements'].shape[0]
|
||||
for transition_idx, transition in enumerate(episode.transitions):
|
||||
transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
|
||||
for step in range(num_steps):
|
||||
offset_idx = transition_idx + 2 ** step
|
||||
|
||||
if offset_idx >= episode.length():
|
||||
if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
|
||||
# the special MSE loss will ignore those entries so that the gradient will be 0 for these
|
||||
transition.info['future_measurements'][step] = np.nan
|
||||
continue
|
||||
|
||||
elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
|
||||
offset_idx = - 1
|
||||
|
||||
transition.info['future_measurements'][step] = \
|
||||
self.target_measurements_scale_factors * \
|
||||
(episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])
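
# Illustrative sketch only (not part of dfp_agent.py): how the future-measurement targets built in
# _update_measurements_targets above are laid out. For every transition, targets are taken at exponentially
# spaced offsets 1, 2, 4, ..., 2**(num_steps - 1); offsets that fall past the episode end are either left as
# NaN (so a masking loss can ignore them) or clamped to the last step, mirroring HandlingTargetsAfterEpisodeEnd.
# The function and variable names below are assumptions made for the example.
import numpy as np

def future_measurement_targets(measurements, num_steps, clamp_to_last_step=False):
    # measurements: array of shape [episode_length, measurements_size]
    episode_length, measurements_size = measurements.shape
    targets = np.full((episode_length, num_steps, measurements_size), np.nan)
    for t in range(episode_length):
        for step in range(num_steps):
            offset = t + 2 ** step
            if offset >= episode_length:
                if not clamp_to_last_step:
                    continue  # leave NaN so the loss can ignore this entry
                offset = episode_length - 1
            # the target is the change in measurements, as in the method above
            targets[t, step] = measurements[offset] - measurements[t]
    return targets

# a 5-step episode with a single measurement that increases by 1 each step
print(future_measurement_targets(np.arange(5.0).reshape(5, 1), num_steps=3))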
|
||||
99
rl_coach/agents/dqn_agent.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
|
||||
InputEmbedderParameters, MiddlewareScheme
|
||||
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class DQNAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
self.discount = 0.99
|
||||
|
||||
|
||||
class DQNNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
|
||||
self.heads_parameters = [QHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class DQNAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DQNAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=ExperienceReplayParameters(),
|
||||
networks={"main": DQNNetworkParameters()})
|
||||
self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
|
||||
self.exploration.evaluation_epsilon = 0.05
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.dqn_agent:DQNAgent'
|
||||
|
||||
|
||||
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
|
||||
class DQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
        # for the action we actually took, the error is:
        # TD error = r + discount * max(q_st_plus_1) - q_st
        # for all other actions, the error is 0
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
TD_errors = []
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
new_target = batch.rewards()[i] +\
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
|
||||
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
|
||||
TD_targets[i, batch.actions()[i]] = new_target
|
||||
|
||||
# update errors in prioritized replay buffer
|
||||
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
|
||||
importance_weights=importance_weights)
|
||||
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
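
# Illustrative worked example only (not part of dqn_agent.py): the TD target construction used above.
# The target equals the current online prediction everywhere except the taken action, which is set to
# r + discount * max_a Q_target(s', a); the bootstrap term is dropped on terminal transitions.
import numpy as np

discount = 0.99
q_online_st = np.array([[1.0, 2.0, 3.0]])         # online Q(s, .) for one transition
q_target_st_plus_1 = np.array([[0.5, 1.5, 1.0]])  # target Q(s', .)
reward, action, game_over = 1.0, 0, False

td_targets = np.copy(q_online_st)
new_target = reward + (1.0 - game_over) * discount * np.max(q_target_st_plus_1[0])
td_error = np.abs(new_target - td_targets[0, action])
td_targets[0, action] = new_target

print(td_targets)  # [[2.485, 2.0, 3.0]] -> only the taken action's entry changed
print(td_error)    # 1.485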
|
||||
108
rl_coach/agents/hac_ddpg_agent.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import copy
|
||||
|
||||
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
|
||||
from rl_coach.core_types import RunPhase
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.time_limit = 40
|
||||
self.sub_goal_testing_rate = 0.5
|
||||
|
||||
|
||||
class HACDDPGAgentParameters(DDPGAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = HACDDPGAlgorithmParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.hac_ddpg_agent:HACDDPGAgent'
|
||||
|
||||
|
||||
# Hierarchical Actor Critic Generating Subgoals DDPG Agent - https://arxiv.org/pdf/1712.00948.pdf
|
||||
class HACDDPGAgent(DDPGAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
|
||||
self.graph_manager = None
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
        # the top level decides, for each of its generated sub-goals, whether all the levels beneath it
        # are in a sub-goal testing phase
|
||||
|
||||
graph_manager = self.parent_level_manager.parent_graph_manager
|
||||
if self.ap.is_a_highest_level_agent:
|
||||
graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate
|
||||
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
if graph_manager.should_test_current_sub_goal:
|
||||
self.exploration_policy.change_phase(RunPhase.TEST)
|
||||
else:
|
||||
self.exploration_policy.change_phase(self.phase)
|
||||
|
||||
action_info = super().choose_action(curr_state)
|
||||
return action_info
|
||||
|
||||
def update_transition_before_adding_to_replay_buffer(self, transition):
|
||||
graph_manager = self.parent_level_manager.parent_graph_manager
|
||||
|
||||
# deal with goals given from a higher level agent
|
||||
if not self.ap.is_a_highest_level_agent:
|
||||
transition.state['desired_goal'] = self.current_hrl_goal
|
||||
transition.next_state['desired_goal'] = self.current_hrl_goal
|
||||
# TODO: allow setting goals which are not part of the state. e.g. state-embedding using get_prediction
|
||||
self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
|
||||
self.current_hrl_goal, transition.next_state))
|
||||
goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
|
||||
self.current_hrl_goal, transition.next_state)
|
||||
transition.reward = goal_reward
|
||||
transition.game_over = transition.game_over or sub_goal_reached
|
||||
|
||||
# each level tests its own generated sub goals
|
||||
if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
|
||||
#TODO-fixme
|
||||
# _, sub_goal_reached = self.parent_level_manager.environment.agents['agent_1'].spaces.goal.\
|
||||
# get_reward_for_goal_and_state(transition.action, transition.next_state)
|
||||
|
||||
_, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
|
||||
transition.action, transition.next_state)
|
||||
|
||||
sub_goal_is_missed = not sub_goal_reached
|
||||
|
||||
if sub_goal_is_missed:
|
||||
transition.reward = -self.ap.algorithm.time_limit
|
||||
return transition
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
super().set_environment_parameters(spaces)
|
||||
|
||||
if self.ap.is_a_highest_level_agent:
|
||||
# the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
|
||||
# their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
|
||||
self.spaces.goal = self.spaces.action
|
||||
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
|
||||
|
||||
if not self.ap.is_a_highest_level_agent:
|
||||
self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward
|
||||
115
rl_coach/agents/human_agent.py
Normal file
@@ -0,0 +1,115 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Union
|
||||
|
||||
import pygame
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.agents.bc_agent import BCNetworkParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, InputEmbedderParameters, EmbedderScheme, \
|
||||
AgentParameters
|
||||
from rl_coach.core_types import ActionInfo
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from pandas import to_pickle
|
||||
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class HumanAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
class HumanNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [PolicyHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
|
||||
|
||||
class HumanAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=HumanAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": BCNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.human_agent:HumanAgent'
|
||||
|
||||
|
||||
class HumanAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.clock = pygame.time.Clock()
|
||||
self.max_fps = int(self.ap.visualization.max_fps_for_human_control)
|
||||
self.env = None
|
||||
|
||||
def init_environment_dependent_modules(self):
|
||||
super().init_environment_dependent_modules()
|
||||
self.env = self.parent_level_manager._real_environment
|
||||
screen.log_title("Human Control Mode")
|
||||
available_keys = self.env.get_available_keys()
|
||||
if available_keys:
|
||||
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
|
||||
screen.log("")
|
||||
for action, key in self.env.get_available_keys():
|
||||
screen.log("\t- {}: {}".format(action, key))
|
||||
screen.separator()
|
||||
|
||||
def train(self):
|
||||
return 0
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
action = ActionInfo(self.env.get_action_from_user(), action_value=0)
|
||||
action = self.output_filter.reverse_filter(action)
|
||||
|
||||
# keep constant fps
|
||||
self.clock.tick(self.max_fps)
|
||||
|
||||
if not self.env.renderer.is_open:
|
||||
self.save_replay_buffer_and_exit()
|
||||
|
||||
return action
|
||||
|
||||
def save_replay_buffer_and_exit(self):
|
||||
replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
|
||||
self.memory.tp = None
|
||||
to_pickle(self.memory, replay_buffer_path)
|
||||
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
|
||||
exit()
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Episode"] = self.current_episode
|
||||
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
screen.log_dict(log, prefix="Recording")
|
||||
76
rl_coach/agents/imitation_agent.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from collections import OrderedDict
|
||||
from typing import Union
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionInfo
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
## This is an abstract agent - there is no learn_from_batch method ##
|
||||
|
||||
# Imitation Agent
|
||||
class ImitationAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.imitation = True
|
||||
|
||||
def extract_action_values(self, prediction):
|
||||
return prediction.squeeze()
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
# convert to batch so we can run it through the network
|
||||
prediction = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(curr_state, 'main'))
|
||||
|
||||
# get action values and extract the best action from it
|
||||
action_values = self.extract_action_values(prediction)
|
||||
if type(self.spaces.action) == DiscreteActionSpace:
|
||||
# DISCRETE
|
||||
self.exploration_policy.phase = RunPhase.TEST
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
action_info = ActionInfo(action=action,
|
||||
action_probability=action_values[action])
|
||||
else:
|
||||
# CONTINUOUS
|
||||
action = action_values
|
||||
|
||||
action_info = ActionInfo(action=action)
|
||||
|
||||
return action_info
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
# for the training phase - we log during the episode to visualize the progress in training
|
||||
log = OrderedDict()
|
||||
if self.task_id is not None:
|
||||
log["Worker"] = self.task_id
|
||||
log["Episode"] = self.current_episode
|
||||
log["Loss"] = self.loss.values[-1]
|
||||
log["Training iteration"] = self.training_iteration
|
||||
screen.log_dict(log, prefix="Training")
|
||||
else:
|
||||
# for the evaluation phase - logging as in regular RL
|
||||
super().log_to_screen()
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
raise NotImplementedError("ImitationAgent is an abstract agent. Not to be used directly.")
|
||||
72
rl_coach/agents/mmc_agent.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
|
||||
|
||||
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.monte_carlo_mixing_rate = 0.1
|
||||
|
||||
|
||||
class MixedMonteCarloAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = MixedMonteCarloAlgorithmParameters()
|
||||
self.memory = EpisodicExperienceReplayParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.mmc_agent:MixedMonteCarloAgent'
|
||||
|
||||
|
||||
class MixedMonteCarloAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# for the 1-step, we use the double-dqn target. hence actions are taken greedily according to the online network
|
||||
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
|
||||
|
||||
# TD_targets are initialized with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
one_step_target = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
|
||||
q_st_plus_1[i][selected_actions[i]]
|
||||
monte_carlo_target = batch.total_returns()[i]
|
||||
TD_targets[i, batch.actions()[i]] = (1 - self.mixing_rate) * one_step_target + \
|
||||
self.mixing_rate * monte_carlo_target
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
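
# Illustrative example only (not part of mmc_agent.py): the mixed target built above. The label for the
# taken action interpolates between the double-DQN one-step target and the Monte Carlo return, with the
# interpolation weight monte_carlo_mixing_rate. All values below are made up for the example.
discount, mixing_rate = 0.99, 0.1
reward, game_over = 1.0, False
q_target_next_for_online_argmax = 1.5   # Q_target(s', argmax_a Q_online(s', a))
monte_carlo_return = 4.0                # total discounted return observed from s

one_step_target = reward + (1.0 - game_over) * discount * q_target_next_for_online_argmax
mixed_target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_return
print(one_step_target, mixed_target)    # 2.485, 2.6365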
|
||||
126
rl_coach/agents/n_step_q_agent.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
|
||||
InputEmbedderParameters
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.utils import last_sample
|
||||
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
|
||||
|
||||
|
||||
class NStepQNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [QHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.async_training = True
|
||||
self.shared_optimizer = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class NStepQAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
|
||||
self.apply_gradients_every_x_episodes = 1
|
||||
self.num_steps_between_gradient_updates = 5 # this is called t_max in all the papers
|
||||
self.targets_horizon = 'N-Step'
|
||||
|
||||
|
||||
class NStepQAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NStepQAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=SingleEpisodeBufferParameters(),
|
||||
networks={"main": NStepQNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.n_step_q_agent:NStepQAgent'
|
||||
|
||||
|
||||
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
|
||||
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.last_gradient_update_step_idx = 0
|
||||
self.q_values = self.register_signal('Q Values')
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
# batch contains a list of episodes to learn from
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# get the values for the current states
|
||||
state_value_head_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
|
||||
|
||||
# the targets for the state value estimator
|
||||
if self.ap.algorithm.targets_horizon == '1-Step':
|
||||
# 1-Step Q learning
|
||||
q_st_plus_1 = self.networks['main'].target_network.predict(batch.next_states(network_keys))
|
||||
|
||||
for i in reversed(range(batch.size)):
|
||||
state_value_head_targets[i][batch.actions()[i]] = \
|
||||
batch.rewards()[i] \
|
||||
+ (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
|
||||
|
||||
elif self.ap.algorithm.targets_horizon == 'N-Step':
|
||||
# N-Step Q learning
|
||||
if batch.game_overs()[-1]:
|
||||
R = 0
|
||||
else:
|
||||
R = np.max(self.networks['main'].target_network.predict(last_sample(batch.next_states(network_keys))))
|
||||
|
||||
for i in reversed(range(batch.size)):
|
||||
R = batch.rewards()[i] + self.ap.algorithm.discount * R
|
||||
state_value_head_targets[i][batch.actions()[i]] = R
|
||||
|
||||
else:
|
||||
            raise ValueError('The available values for targets_horizon are: 1-Step, N-Step')
|
||||
|
||||
# train
|
||||
result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys), [state_value_head_targets])
|
||||
|
||||
# logging
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
self.value_loss.add_sample(losses[0])
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def train(self):
|
||||
# update the target network of every network that has a target network
|
||||
if any([network.has_target for network in self.networks.values()]) \
|
||||
and self._should_update_online_weights_to_target():
|
||||
for network in self.networks.values():
|
||||
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
|
||||
|
||||
self.agent_logger.create_signal_value('Update Target Network', 1)
|
||||
else:
|
||||
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
|
||||
return PolicyOptimizationAgent.train(self)
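
# Illustrative sketch only (not part of n_step_q_agent.py): the N-step targets computed above. The return is
# bootstrapped from the target network's value at the last state (unless the episode ended) and then
# accumulated backwards with R <- r_t + discount * R. Names below are assumptions made for the example.
import numpy as np

def n_step_targets(rewards, bootstrap_value, episode_ended, discount=0.99):
    R = 0.0 if episode_ended else bootstrap_value
    targets = np.zeros(len(rewards))
    for i in reversed(range(len(rewards))):
        R = rewards[i] + discount * R
        targets[i] = R
    return targets

print(n_step_targets([1.0, 0.0, 1.0], bootstrap_value=2.0, episode_ended=False))
# [3.920698, 2.9502, 2.98] -> each entry is the discounted sum of future rewards plus the bootstrap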
|
||||
126
rl_coach/agents/naf_agent.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.naf_head import NAFHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, \
|
||||
NetworkParameters, InputEmbedderParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.spaces import BoxActionSpace
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps
|
||||
from rl_coach.exploration_policies.ou_process import OUProcessParameters
|
||||
|
||||
|
||||
class NAFNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [NAFHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.learning_rate = 0.001
|
||||
self.async_training = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class NAFAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_consecutive_training_steps = 5
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
|
||||
self.rate_for_copying_weights_to_target = 0.001
|
||||
|
||||
|
||||
class NAFAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NAFAlgorithmParameters(),
|
||||
exploration=OUProcessParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": NAFNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.naf_agent:NAFAgent'
|
||||
|
||||
|
||||
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
|
||||
class NAFAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.l_values = self.register_signal("L")
|
||||
self.a_values = self.register_signal("Advantage")
|
||||
self.mu_values = self.register_signal("Action")
|
||||
self.v_values = self.register_signal("V")
|
||||
self.TD_targets = self.register_signal("TD targets")
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# TD error = r + discount*v_st_plus_1 - q_st
|
||||
v_st_plus_1 = self.networks['main'].target_network.predict(
|
||||
batch.next_states(network_keys),
|
||||
self.networks['main'].target_network.output_heads[0].V,
|
||||
squeeze_output=False,
|
||||
)
|
||||
TD_targets = np.expand_dims(batch.rewards(), -1) + \
|
||||
(1.0 - np.expand_dims(batch.game_overs(), -1)) * self.ap.algorithm.discount * v_st_plus_1
|
||||
|
||||
self.TD_targets.add_sample(TD_targets)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
|
||||
'output_0_0': batch.actions(len(batch.actions().shape) == 1)
|
||||
}, TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if type(self.spaces.action) != BoxActionSpace:
|
||||
raise ValueError('NAF works only for continuous control problems')
|
||||
|
||||
# convert to batch so we can run it through the network
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
|
||||
naf_head = self.networks['main'].online_network.output_heads[0]
|
||||
action_values = self.networks['main'].online_network.predict(tf_input_state, outputs=naf_head.mu,
|
||||
squeeze_output=False)
|
||||
|
||||
# get the actual action to use
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
# get the internal values for logging
|
||||
outputs = [naf_head.mu, naf_head.Q, naf_head.L, naf_head.A, naf_head.V]
|
||||
result = self.networks['main'].online_network.predict(
|
||||
{**tf_input_state, 'output_0_0': action_values},
|
||||
outputs=outputs
|
||||
)
|
||||
mu, Q, L, A, V = result
|
||||
|
||||
# store the q values statistics for logging
|
||||
self.q_values.add_sample(Q)
|
||||
self.l_values.add_sample(L)
|
||||
self.a_values.add_sample(A)
|
||||
self.mu_values.add_sample(mu)
|
||||
self.v_values.add_sample(V)
|
||||
|
||||
action_info = ActionInfo(action=action, action_value=Q)
|
||||
|
||||
return action_info
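
# Illustrative sketch only (not the NAF head implementation): the decomposition the head above exposes,
# Q(s, a) = V(s) + A(s, a) with A(s, a) = -0.5 * (a - mu)^T P (a - mu) and P = L L^T built from a
# lower-triangular matrix, so Q is maximized exactly at a = mu. Names below are assumptions for the example.
import numpy as np

def naf_q_value(action, mu, L, V):
    P = L @ L.T                        # positive semi-definite matrix
    diff = action - mu
    advantage = -0.5 * diff @ P @ diff
    return V + advantage

mu = np.array([0.2, -0.1])
L = np.array([[1.0, 0.0], [0.3, 0.5]])
V = 1.0
print(naf_q_value(mu, mu, L, V))                    # 1.0 -> the maximum is attained at a = mu
print(naf_q_value(np.array([1.0, 0.0]), mu, L, V))  # strictly smaller than 1.0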
|
||||
176
rl_coach/agents/nec_agent.py
Normal file
@@ -0,0 +1,176 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.dnd_q_head import DNDQHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
|
||||
InputEmbedderParameters
|
||||
from rl_coach.core_types import RunPhase, EnvironmentSteps, Episode, StateType
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, MemoryGranularity
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class NECNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [DNDQHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
|
||||
|
||||
class NECAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.dnd_size = 500000
|
||||
self.l2_norm_added_delta = 0.001
|
||||
self.new_value_shift_coefficient = 0.1
|
||||
self.number_of_knn = 50
|
||||
self.DND_key_error_threshold = 0
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
self.propagate_updates_to_DND = False
|
||||
self.n_step = 100
|
||||
self.bootstrap_total_return_from_old_policy = True
|
||||
|
||||
|
||||
class NECMemoryParameters(EpisodicExperienceReplayParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_size = (MemoryGranularity.Transitions, 100000)
|
||||
|
||||
|
||||
class NECAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NECAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=NECMemoryParameters(),
|
||||
networks={"main": NECNetworkParameters()})
|
||||
self.exploration.epsilon_schedule = ConstantSchedule(0.1)
|
||||
self.exploration.evaluation_epsilon = 0.01
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.nec_agent:NECAgent'
|
||||
|
||||
|
||||
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
|
||||
class NECAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.current_episode_state_embeddings = []
|
||||
self.training_started = False
|
||||
self.current_episode_buffer = \
|
||||
Episode(discount=self.ap.algorithm.discount,
|
||||
n_step=self.ap.algorithm.n_step,
|
||||
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(self.ap.algorithm.number_of_knn):
|
||||
return 0, [], 0
|
||||
else:
|
||||
if not self.training_started:
|
||||
self.training_started = True
|
||||
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
|
||||
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]
|
||||
|
||||
# set the gradients to fetch for the DND update
|
||||
fetches = []
|
||||
head = self.networks['main'].online_network.output_heads[0]
|
||||
if self.ap.algorithm.propagate_updates_to_DND:
|
||||
fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]
|
||||
|
||||
# train the neural network
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets, fetches)
|
||||
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
# update the DND keys and values using the extracted gradients
|
||||
if self.ap.algorithm.propagate_updates_to_DND:
|
||||
embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
|
||||
value_gradients = np.swapaxes(result[-1][1], 0, 1)
|
||||
indices = np.swapaxes(result[-1][2], 0, 1)
|
||||
head.DND.update_keys_and_values(batch.actions(), embedding_gradients, value_gradients, indices)
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def act(self):
|
||||
if self.phase == RunPhase.HEATUP:
|
||||
# get embedding in heatup (otherwise we get it through get_prediction)
|
||||
embedding = self.networks['main'].online_network.predict(
|
||||
self.prepare_batch_for_inference(self.curr_state, 'main'),
|
||||
outputs=self.networks['main'].online_network.state_embedding)
|
||||
self.current_episode_state_embeddings.append(embedding)
|
||||
|
||||
return super().act()
|
||||
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
# we need to store the state embeddings regardless if the action is random or not
|
||||
return self.get_prediction(states)
|
||||
|
||||
def get_prediction(self, states):
|
||||
# get the actions q values and the state embedding
|
||||
embedding, actions_q_values = self.networks['main'].online_network.predict(
|
||||
self.prepare_batch_for_inference(states, 'main'),
|
||||
outputs=[self.networks['main'].online_network.state_embedding,
|
||||
self.networks['main'].online_network.output_heads[0].output]
|
||||
)
|
||||
if self.phase != RunPhase.TEST:
|
||||
# store the state embedding for inserting it to the DND later
|
||||
self.current_episode_state_embeddings.append(embedding.squeeze())
|
||||
actions_q_values = actions_q_values[0][0]
|
||||
return actions_q_values
|
||||
|
||||
def reset_internal_state(self):
|
||||
super().reset_internal_state()
|
||||
self.current_episode_state_embeddings = []
|
||||
self.current_episode_buffer = \
|
||||
Episode(discount=self.ap.algorithm.discount,
|
||||
n_step=self.ap.algorithm.n_step,
|
||||
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
|
||||
|
||||
def handle_episode_ended(self):
|
||||
super().handle_episode_ended()
|
||||
|
||||
# get the last full episode that we have collected
|
||||
episode = self.call_memory('get_last_complete_episode')
|
||||
if episode is not None and self.phase != RunPhase.TEST:
|
||||
assert len(self.current_episode_state_embeddings) == episode.length()
|
||||
returns = episode.get_transitions_attribute('total_return')
|
||||
actions = episode.get_transitions_attribute('action')
|
||||
self.networks['main'].online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
|
||||
actions, returns)
|
||||
|
||||
def save_checkpoint(self, checkpoint_id):
|
||||
with open(os.path.join(self.ap.task_parameters.save_checkpoint_dir, str(checkpoint_id) + '.dnd'), 'wb') as f:
|
||||
pickle.dump(self.networks['main'].online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL)
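
# Illustrative sketch only (not the repository's DND implementation): the episodic lookup idea used by the
# DND head above. The Q value of an action is a kernel-weighted average of the stored returns of the k
# nearest keys to the current state embedding. All names below are assumptions made for the example.
import numpy as np

def dnd_q_value(query_embedding, keys, values, k=2, delta=1e-3):
    distances = np.sum((keys - query_embedding) ** 2, axis=1)
    nearest = np.argsort(distances)[:k]
    kernel = 1.0 / (distances[nearest] + delta)      # inverse-distance kernel
    weights = kernel / np.sum(kernel)
    return np.dot(weights, values[nearest])

keys = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]])   # stored state embeddings
values = np.array([1.0, 3.0, 5.0])                      # stored N-step returns
print(dnd_q_value(np.array([0.1, 0.1]), keys, values))  # close to 1.0, the nearest stored return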
|
||||
94
rl_coach/agents/pal_agent.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay, \
|
||||
EpisodicExperienceReplayParameters
|
||||
|
||||
|
||||
class PALAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.pal_alpha = 0.9
|
||||
self.persistent_advantage_learning = False
|
||||
self.monte_carlo_mixing_rate = 0.1
|
||||
|
||||
|
||||
class PALAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = PALAlgorithmParameters()
|
||||
self.memory = EpisodicExperienceReplayParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.pal_agent:PALAgent'
|
||||
|
||||
|
||||
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
|
||||
class PALAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.alpha = agent_parameters.algorithm.pal_alpha
|
||||
self.persistent = agent_parameters.algorithm.persistent_advantage_learning
|
||||
self.monte_carlo_mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# next state values
|
||||
q_st_plus_1_target, q_st_plus_1_online = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.next_states(network_keys))
|
||||
])
|
||||
selected_actions = np.argmax(q_st_plus_1_online, 1)
|
||||
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
|
||||
|
||||
# current state values
|
||||
q_st_target, q_st_online = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
v_st_target = np.max(q_st_target, 1)
|
||||
|
||||
# calculate TD error
|
||||
TD_targets = np.copy(q_st_online)
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
|
||||
q_st_plus_1_target[i][selected_actions[i]]
|
||||
advantage_learning_update = v_st_target[i] - q_st_target[i, batch.actions()[i]]
|
||||
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
|
||||
# Persistent Advantage Learning or Regular Advantage Learning
|
||||
if self.persistent:
|
||||
TD_targets[i, batch.actions()[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
|
||||
else:
|
||||
TD_targets[i, batch.actions()[i]] -= self.alpha * advantage_learning_update
|
||||
|
||||
# mixing monte carlo updates
|
||||
monte_carlo_target = batch.total_returns()[i]
|
||||
TD_targets[i, batch.actions()[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, batch.actions()[i]] \
|
||||
+ self.monte_carlo_mixing_rate * monte_carlo_target
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
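
# Illustrative example only (not part of pal_agent.py): the advantage-learning correction applied above.
# The DQN-style target for the taken action is reduced by alpha times the advantage gap, and the persistent
# variant uses the smaller of the current-state and next-state gaps. Values below are made up for the example.
import numpy as np

alpha, discount = 0.9, 0.99
reward, game_over = 1.0, False
q_st_target = np.array([2.0, 1.0])           # Q_target(s, .)
q_st_plus_1_target = np.array([1.5, 1.2])    # Q_target(s', .)
action, selected_next_action = 1, 0          # taken action; argmax of the online Q(s', .)

dqn_target = reward + (1.0 - game_over) * discount * q_st_plus_1_target[selected_next_action]
advantage_gap = np.max(q_st_target) - q_st_target[action]
next_advantage_gap = np.max(q_st_plus_1_target) - q_st_plus_1_target[selected_next_action]

al_target = dqn_target - alpha * advantage_gap                             # advantage learning
pal_target = dqn_target - alpha * min(advantage_gap, next_advantage_gap)   # persistent variant
print(dqn_target, al_target, pal_target)     # 2.485, 1.585, 2.485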
|
||||
105
rl_coach/agents/policy_gradients_agent.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
|
||||
AgentParameters, InputEmbedderParameters
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.logger import screen
|
||||
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
|
||||
|
||||
|
||||
class PolicyGradientNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [PolicyHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = True
|
||||
|
||||
|
||||
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
|
||||
self.apply_gradients_every_x_episodes = 5
|
||||
self.beta_entropy = 0
|
||||
self.num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers
|
||||
|
||||
|
||||
class PolicyGradientsAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
|
||||
exploration=AdditiveNoiseParameters(),
|
||||
memory=SingleEpisodeBufferParameters(),
|
||||
networks={"main": PolicyGradientNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.policy_gradients_agent:PolicyGradientsAgent'
|
||||
|
||||
|
||||
class PolicyGradientsAgent(PolicyOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.returns_mean = self.register_signal('Returns Mean')
|
||||
self.returns_variance = self.register_signal('Returns Variance')
|
||||
self.last_gradient_update_step_idx = 0
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
# batch contains a list of episodes to learn from
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
total_returns = batch.total_returns()
|
||||
for i in reversed(range(batch.size)):
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
|
||||
total_returns[i] = total_returns[0]
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
|
||||
# just take the total return as it is
|
||||
pass
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
|
||||
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
|
||||
if self.std_discounted_return != 0:
|
||||
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
|
||||
else:
|
||||
total_returns[i] = 0
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
|
||||
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
targets = total_returns
|
||||
actions = batch.actions()
|
||||
if type(self.spaces.action) != DiscreteActionSpace and len(actions.shape) < 2:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
self.returns_mean.add_sample(np.mean(total_returns))
|
||||
self.returns_variance.add_sample(np.std(total_returns))
|
||||
|
||||
result = self.networks['main'].online_network.accumulate_gradients(
|
||||
{**batch.states(network_keys), 'output_0_0': actions}, targets
|
||||
)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
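
# Illustrative sketch only (not part of policy_gradients_agent.py): the FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
# rescaler used by default above. Each future return is baselined by the running mean return observed at the
# same timestep index across episodes (the per-timestep baseline maintained in PolicyOptimizationAgent).
import numpy as np

future_returns = np.array([5.0, 3.0, 1.0])                       # discounted future return per timestep
mean_return_over_multiple_episodes = np.array([4.0, 2.5, 1.5])   # running per-timestep baseline

rescaled = future_returns - mean_return_over_multiple_episodes
print(rescaled)  # [ 1.   0.5 -0.5] -> these values scale the log-probability gradients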
|
||||
166
rl_coach/agents/policy_optimization_agent.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from collections import OrderedDict
from enum import Enum
from typing import Union

import numpy as np
from rl_coach.core_types import Batch, ActionInfo
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import eps

from rl_coach.agents.agent import Agent
from rl_coach.logger import screen


class PolicyGradientRescaler(Enum):
    TOTAL_RETURN = 0
    FUTURE_RETURN = 1
    FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
    FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3  # baselined
    Q_VALUE = 4
    A_VALUE = 5
    TD_RESIDUAL = 6
    DISCOUNTED_TD_RESIDUAL = 7
    GAE = 8


## This is an abstract agent - there is no learn_from_batch method ##


class PolicyOptimizationAgent(Agent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

        self.policy_gradient_rescaler = None
        if hasattr(self.ap.algorithm, 'policy_gradient_rescaler'):
            self.policy_gradient_rescaler = self.ap.algorithm.policy_gradient_rescaler

        # statistics for variance reduction
        self.last_gradient_update_step_idx = 0
        self.max_episode_length = 100000
        self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
        self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
        self.entropy = self.register_signal('Entropy')

    def log_to_screen(self):
        # log to screen
        log = OrderedDict()
        log["Name"] = self.full_name_id
        if self.task_id is not None:
            log["Worker"] = self.task_id
        log["Episode"] = self.current_episode
        log["Total reward"] = round(self.total_reward_in_current_episode, 2)
        log["Steps"] = self.total_steps_counter
        log["Training iteration"] = self.training_iteration
        screen.log_dict(log, prefix=self.phase.value)

    def update_episode_statistics(self, episode):
        episode_discounted_returns = []
        for i in range(episode.length()):
            transition = episode.get_transition(i)
            episode_discounted_returns.append(transition.total_return)
            self.num_episodes_where_step_has_been_seen[i] += 1
            self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
                                                          self.num_episodes_where_step_has_been_seen[i]
            self.mean_return_over_multiple_episodes[i] += transition.total_return / \
                                                          self.num_episodes_where_step_has_been_seen[i]
        self.mean_discounted_return = np.mean(episode_discounted_returns)
        self.std_discounted_return = np.std(episode_discounted_returns)

    def get_current_episode(self):
        # we get the episode most of the time from the current episode buffer, and only in the last transition from
        # the "memory" (where it was stored at the end of the episode)
        return self.memory.get_episode(0) or self.current_episode_buffer

    def train(self):
        episode = self.get_current_episode()

        # check if we should calculate gradients or skip
        episode_ended = episode.is_complete
        num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
        is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
        if not (is_t_max_steps_passed or episode_ended):
            return 0

        total_loss = 0
        if num_steps_passed_since_last_update > 0:

            # we need to update the returns of the episode until now
            episode.update_returns()

            # get t_max transitions, or fewer if we got to a terminal state.
            # this will be used for both actor-critic and vanilla PG.
            # in order to get full episodes, vanilla PG will set the end_idx to a very big value.
            transitions = []
            start_idx = self.last_gradient_update_step_idx
            end_idx = episode.length()

            for idx in range(start_idx, end_idx):
                transitions.append(episode.get_transition(idx))
            self.last_gradient_update_step_idx = end_idx

            # update the statistics for the variance reduction techniques
            if self.policy_gradient_rescaler in \
                    [PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
                     PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
                self.update_episode_statistics(episode)

            # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
            batch = Batch(transitions)
            total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
            if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
                for network in self.networks.values():
                    network.apply_gradients_and_sync_networks()
                self.training_iteration += 1

        # move the pointer to the next episode start and discard the episode.
        if episode_ended:
            # we need to remove the episode, because the next training iteration will be called before storing any
            # additional transitions in the memory (we don't store a transition for the first call to observe), so the
            # length of the memory won't be enforced and the old episode won't be removed
            self.call_memory('remove_episode', 0)
            self.last_gradient_update_step_idx = 0

        return total_loss

    def learn_from_batch(self, batch):
        raise NotImplementedError("PolicyOptimizationAgent is an abstract agent. Not to be used directly.")

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "main")
        return self.networks['main'].online_network.predict(tf_input_state)

    def choose_action(self, curr_state):
        # convert to batch so we can run it through the network
        action_values = self.get_prediction(curr_state)
        if isinstance(self.spaces.action, DiscreteActionSpace):
            # DISCRETE
            action_probabilities = np.array(action_values).squeeze()
            action = self.exploration_policy.get_action(action_probabilities)
            action_info = ActionInfo(action=action,
                                     action_probability=action_probabilities[action])

            self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
        elif isinstance(self.spaces.action, BoxActionSpace):
            # CONTINUOUS
            action = self.exploration_policy.get_action(action_values)

            action_info = ActionInfo(action=action)
        else:
            raise ValueError("The action space of the environment is not compatible with the algorithm")
        return action_info
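PolicyOptimizationAgent.update_episode_statistics above maintains, for every timestep index i, a running mean of the return observed at that index across episodes, using the incremental update m_i <- m_i + (x - m_i) / n_i (written there as a subtraction followed by an addition). A hedged, self-contained sketch of the same bookkeeping; the class name is illustrative, not part of the Coach API:

import numpy as np

class RunningMeanPerTimestep:
    """Incremental per-timestep mean: m_i <- m_i + (x - m_i) / n_i."""
    def __init__(self, max_episode_length=100000):
        self.mean = np.zeros(max_episode_length)
        self.count = np.zeros(max_episode_length)

    def update(self, episode_returns):
        for i, ret in enumerate(episode_returns):
            self.count[i] += 1
            self.mean[i] += (ret - self.mean[i]) / self.count[i]

# example: after two episodes, the mean at step 0 is (4 + 2) / 2 = 3
stats = RunningMeanPerTimestep()
stats.update([4.0, 1.0])
stats.update([2.0])
print(stats.mean[:2])  # [3.0, 1.0]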
338
rl_coach/agents/ppo_agent.py
Normal file
@@ -0,0 +1,338 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
from collections import OrderedDict
from typing import Union

import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
    AgentParameters, InputEmbedderParameters, DistributedTaskParameters
from rl_coach.core_types import EnvironmentSteps, Batch
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import force_list

from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen


class PPOCriticNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
        self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
        self.heads_parameters = [VHeadParameters()]
        self.loss_weights = [1.0]
        self.async_training = True
        self.l2_regularization = 0
        self.create_target_network = True
        self.batch_size = 128


class PPOActorNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
        self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
        self.heads_parameters = [PPOHeadParameters()]
        self.optimizer_type = 'Adam'
        self.loss_weights = [1.0]
        self.async_training = True
        self.l2_regularization = 0
        self.create_target_network = True
        self.batch_size = 128


class PPOAlgorithmParameters(AlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
        self.gae_lambda = 0.96
        self.target_kl_divergence = 0.01
        self.initial_kl_coefficient = 1.0
        self.high_kl_penalty_coefficient = 1000
        self.clip_likelihood_ratio_using_epsilon = None
        self.value_targets_mix_fraction = 0.1
        self.estimate_state_value_using_gae = True
        self.step_until_collecting_full_episodes = True
        self.use_kl_regularization = True
        self.beta_entropy = 0.01
        self.num_consecutive_playing_steps = EnvironmentSteps(5000)


class PPOAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=PPOAlgorithmParameters(),
                         exploration=AdditiveNoiseParameters(),
                         memory=EpisodicExperienceReplayParameters(),
                         networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.ppo_agent:PPOAgent'


# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

        # signals definition
        self.value_loss = self.register_signal('Value Loss')
        self.policy_loss = self.register_signal('Policy Loss')
        self.kl_divergence = self.register_signal('KL Divergence')
        self.total_kl_divergence_during_training_process = 0.0
        self.unclipped_grads = self.register_signal('Grads (unclipped)')

    def fill_advantages(self, batch):
        batch = Batch(batch)
        network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()

        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = batch.total_returns() - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[batch.game_overs()] = 0
            for idx, game_over in enumerate(batch.game_overs()):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1,))
                    rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        # TODO: this will be problematic with a shared memory
        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)

    def train_value_network(self, dataset, epochs):
        loss = []
        batch = Batch(dataset)
        network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

        # * Found not to have any impact *
        # add a timestep to the observation
        # current_states_with_timestep = self.concat_state_and_timestep(dataset)

        mix_fraction = self.ap.algorithm.value_targets_mix_fraction
        for j in range(epochs):
            curr_batch_size = batch.size
            if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                curr_batch_size = self.ap.network_wrappers['critic'].batch_size
            for i in range(batch.size // curr_batch_size):
                # split to batches for first order optimization techniques
                current_states_batch = {
                    k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
                    for k, v in batch.states(network_keys).items()
                }
                total_return_batch = batch.total_returns(True)[i * curr_batch_size:(i + 1) * curr_batch_size]
                old_policy_values = force_list(self.networks['critic'].target_network.predict(
                    current_states_batch).squeeze())
                if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
                    targets = total_return_batch
                else:
                    current_values = self.networks['critic'].online_network.predict(current_states_batch)
                    targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

                inputs = copy.copy(current_states_batch)
                for input_index, input in enumerate(old_policy_values):
                    name = 'output_0_{}'.format(input_index)
                    if name in self.networks['critic'].online_network.inputs:
                        inputs[name] = input

                value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)

                self.networks['critic'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                    self.networks['critic'].apply_gradients_to_global_network()
                self.networks['critic'].online_network.reset_accumulated_gradients()

                loss.append([value_loss[0]])
        loss = np.mean(loss, 0)
        return loss

    def concat_state_and_timestep(self, dataset):
        current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
                                        for transition in dataset]
        current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
        return current_states_with_timestep

    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            # shuffle(dataset)
            for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
                batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
                                      (i + 1) * self.ap.network_wrappers['actor'].batch_size])

                network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()

                advantages = batch.info('advantage')
                actions = batch.actions()
                if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
                           self.networks['actor'].online_network.output_heads[0].entropy]

                inputs = copy.copy(batch.states(network_keys))
                inputs['output_0_0'] = actions

                # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
                # it has just a mean. otherwise, it has both a mean and standard deviation
                for input_index, input in enumerate(old_policy):
                    inputs['output_0_{}'.format(input_index + 1)] = input

                total_loss, policy_losses, unclipped_grads, fetch_result =\
                    self.networks['actor'].online_network.accumulate_gradients(
                        inputs, [advantages], additional_fetches=fetches)

                self.networks['actor'].apply_gradients_to_online_network()
                if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                    self.networks['actor'].apply_gradients_to_global_network()

                self.networks['actor'].online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
                curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate

            # log training parameters
            screen.log_dict(
                OrderedDict([
                    ("Surrogate loss", loss['policy_losses'][0]),
                    ("KL divergence", loss['fetch_result'][0]),
                    ("Entropy", loss['fetch_result'][1]),
                    ("training epoch", j),
                    ("learning_rate", curr_learning_rate)
                ]),
                prefix="Policy training"
            )

        self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']

    def update_kl_coefficient(self):
        # John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
        # his implementation for now because we know it works well
        screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))

        # update kl coefficient
        kl_target = self.ap.algorithm.target_kl_divergence
        kl_coefficient = self.networks['actor'].online_network.get_variable_value(
            self.networks['actor'].online_network.output_heads[0].kl_coefficient)
        new_kl_coefficient = kl_coefficient
        if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
            # kl too high => increase regularization
            new_kl_coefficient *= 1.5
        elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
            # kl too low => decrease regularization
            new_kl_coefficient /= 1.5

        # update the kl coefficient variable
        if kl_coefficient != new_kl_coefficient:
            self.networks['actor'].online_network.set_variable_value(
                self.networks['actor'].online_network.output_heads[0].assign_kl_coefficient,
                new_kl_coefficient,
                self.networks['actor'].online_network.output_heads[0].kl_coefficient_ph)

        screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))

    def post_training_commands(self):
        if self.ap.algorithm.use_kl_regularization:
            self.update_kl_coefficient()

        # clean memory
        self.call_memory('clean')

    def train(self):
        loss = 0
        if self._should_train(wait_for_full_episode=True):
            for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
                self.networks['actor'].sync()
                self.networks['critic'].sync()

                dataset = self.memory.transitions

                self.fill_advantages(dataset)

                # take only the requested number of steps
                dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]

                value_loss = self.train_value_network(dataset, 1)
                policy_loss = self.train_policy_network(dataset, 10)

                self.value_loss.add_sample(value_loss)
                self.policy_loss.add_sample(policy_loss)

            self.post_training_commands()
            self.training_iteration += 1
            self.update_log()  # should be done in order to update the data that has been accumulated * while not playing *
            return np.append(value_loss, policy_loss)

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "actor")
        return self.networks['actor'].online_network.predict(tf_input_state)
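Two pieces of the agent above are easy to miss in the listing: fill_advantages delegates to get_general_advantage_estimation_values (not shown in this diff), and update_kl_coefficient implements the adaptive KL penalty rule from the PPO paper. A hedged sketch of both, under the usual GAE definition A_t = sum_l (gamma * lambda)^l * delta_{t+l} with delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); the function names are illustrative and not the Coach API:

import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.96):
    """Generalized Advantage Estimation for a single rollout.

    `values` has one more entry than `rewards`; the last entry is the bootstrap value
    (zero above, since rollouts are cut at terminal states).
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    deltas = rewards + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

def adapt_kl_coefficient(kl, kl_target, kl_coefficient):
    """Adaptive KL penalty rule mirroring update_kl_coefficient above."""
    if kl > 1.3 * kl_target:
        kl_coefficient *= 1.5   # KL too high -> strengthen the penalty
    elif kl < 0.7 * kl_target:
        kl_coefficient /= 1.5   # KL too low -> relax the penalty
    return kl_coefficient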
112
rl_coach/agents/qr_dqn_agent.py
Normal file
@@ -0,0 +1,112 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np
from rl_coach.architectures.tensorflow_components.heads.quantile_regression_q_head import QuantileRegressionQHeadParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import StateType


class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
    def __init__(self):
        super().__init__()
        self.heads_parameters = [QuantileRegressionQHeadParameters()]
        self.learning_rate = 0.00005
        self.optimizer_epsilon = 0.01 / 32


class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.atoms = 200
        self.huber_loss_interval = 1  # called k in the paper


class QuantileRegressionDQNAgentParameters(DQNAgentParameters):
    def __init__(self):
        super().__init__()
        self.algorithm = QuantileRegressionDQNAlgorithmParameters()
        self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
        self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
        self.exploration.evaluation_epsilon = 0.001

    @property
    def path(self):
        return 'rl_coach.agents.qr_dqn_agent:QuantileRegressionDQNAgent'


# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.quantile_probabilities = np.ones(self.ap.algorithm.atoms) / float(self.ap.algorithm.atoms)

    def get_q_values(self, quantile_values):
        return np.dot(quantile_values, self.quantile_probabilities)

    # prediction's format is (batch, actions, atoms)
    def get_all_q_values_for_states(self, states: StateType):
        if self.exploration_policy.requires_action_values():
            quantile_values = self.get_prediction(states)
            actions_q_values = self.get_q_values(quantile_values)
        else:
            actions_q_values = None
        return actions_q_values

    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the quantiles of the next states and current states
        next_state_quantiles, current_quantiles = self.networks['main'].parallel_prediction([
            (self.networks['main'].target_network, batch.next_states(network_keys)),
            (self.networks['main'].online_network, batch.states(network_keys))
        ])

        # get the optimal actions to take for the next states
        target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)

        # calculate the Bellman update
        batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))

        TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
                     * next_state_quantiles[batch_idx, target_actions]

        # get the locations of the selected actions within the batch for indexing purposes
        actions_locations = [[b, a] for b, a in zip(batch_idx, batch.actions())]

        # calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
        cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms)  # tau_i
        quantile_midpoints = 0.5 * (cumulative_probabilities[1:] + cumulative_probabilities[:-1])  # tau^hat_i
        quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
        sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
        for idx in range(self.ap.network_wrappers['main'].batch_size):
            quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]

        # train
        result = self.networks['main'].train_and_sync_networks({
            **batch.states(network_keys),
            'output_0_0': actions_locations,
            'output_0_1': quantile_midpoints,
        }, TD_targets)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads
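The quantile bookkeeping in learn_from_batch above follows the QR-DQN paper: with N atoms, tau_i = i / N for i = 0..N, and the regression targets are matched against the midpoints tau^hat_i = (tau_{i-1} + tau_i) / 2, while the Bellman target shifts and discounts each next-state quantile. A small standalone sketch of those two pieces (names are illustrative, not the Coach API):

import numpy as np

def quantile_midpoints(num_atoms):
    """tau^hat_i = (tau_{i-1} + tau_i) / 2 for tau_i = i / N, i = 0..N."""
    tau = np.arange(num_atoms + 1) / float(num_atoms)
    return 0.5 * (tau[1:] + tau[:-1])

def qr_dqn_target(reward, done, discount, next_state_quantiles):
    """Distributional Bellman target for one transition.

    `next_state_quantiles` has shape (actions, atoms); the greedy action is chosen by the
    mean over atoms (uniform quantile weights), and terminal states bootstrap to zero.
    """
    greedy_action = np.argmax(next_state_quantiles.mean(axis=-1))
    return reward + (1.0 - float(done)) * discount * next_state_quantiles[greedy_action]

print(quantile_midpoints(4))  # [0.125 0.375 0.625 0.875]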
98
rl_coach/agents/value_optimization_agent.py
Normal file
@@ -0,0 +1,98 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
from rl_coach.spaces import DiscreteActionSpace

from rl_coach.agents.agent import Agent
from rl_coach.core_types import ActionInfo, StateType


## This is an abstract agent - there is no learn_from_batch method ##


class ValueOptimizationAgent(Agent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.q_values = self.register_signal("Q")
        self.q_value_for_action = {}

    def init_environment_dependent_modules(self):
        super().init_environment_dependent_modules()
        if isinstance(self.spaces.action, DiscreteActionSpace):
            for i in range(len(self.spaces.action.actions)):
                self.q_value_for_action[i] = self.register_signal("Q for action {}".format(i),
                                                                  dump_one_value_per_episode=False,
                                                                  dump_one_value_per_step=True)

    # Algorithms for which q_values are calculated from predictions will override this function
    def get_all_q_values_for_states(self, states: StateType):
        if self.exploration_policy.requires_action_values():
            actions_q_values = self.get_prediction(states)
        else:
            actions_q_values = None
        return actions_q_values

    def get_prediction(self, states):
        return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))

    def update_transition_priorities_and_get_weights(self, TD_errors, batch):
        # update errors in prioritized replay buffer
        importance_weights = None
        if isinstance(self.memory, PrioritizedExperienceReplay):
            self.call_memory('update_priorities', (batch.info('idx'), TD_errors))
            importance_weights = batch.info('weight')
        return importance_weights

    def _validate_action(self, policy, action):
        if np.array(action).shape != ():
            raise ValueError((
                'The exploration_policy {} returned a vector of actions '
                'instead of a single action. ValueOptimizationAgents '
                'require exploration policies which return a single action.'
            ).format(policy.__class__.__name__))

    def choose_action(self, curr_state):
        actions_q_values = self.get_all_q_values_for_states(curr_state)

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
        action = self.exploration_policy.get_action(actions_q_values)
        self._validate_action(self.exploration_policy, action)

        if actions_q_values is not None:
            # this is for bootstrapped dqn
            if type(actions_q_values) == list and len(actions_q_values) > 0:
                actions_q_values = self.exploration_policy.last_action_values
            actions_q_values = actions_q_values.squeeze()

            # store the q values statistics for logging
            self.q_values.add_sample(actions_q_values)
            for i, q_value in enumerate(actions_q_values):
                self.q_value_for_action[i].add_sample(q_value)

            action_info = ActionInfo(action=action,
                                     action_value=actions_q_values[action],
                                     max_action_value=np.max(actions_q_values))
        else:
            action_info = ActionInfo(action=action)

        return action_info

    def learn_from_batch(self, batch):
        raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")
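update_transition_priorities_and_get_weights above only forwards the TD errors to the prioritized replay memory and reads back the sampled importance weights. For context, a hedged sketch of the standard proportional prioritization those quantities typically come from (the Schaul et al., 2016 scheme, shown here as an illustration rather than Coach's exact implementation):

import numpy as np

def proportional_priorities(td_errors, alpha=0.6, eps=1e-6):
    """p_i = (|delta_i| + eps)^alpha, proportional prioritization of transitions."""
    return (np.abs(np.asarray(td_errors, dtype=np.float64)) + eps) ** alpha

def importance_weights(sample_probabilities, replay_size, beta=0.4):
    """w_i = (N * P(i))^(-beta), normalized by the maximum weight for stability."""
    weights = (replay_size * np.asarray(sample_probabilities, dtype=np.float64)) ** (-beta)
    return weights / weights.max()

# example: larger TD errors get higher priority, and rarer samples get larger weights
p = proportional_priorities([0.1, 1.0, 2.5])
print(importance_weights(p / p.sum(), replay_size=1000))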