Mirror of https://github.com/gryf/coach.git (synced 2025-12-18 03:30:19 +01:00)

Commit: pre-release 0.10.0
0
rl_coach/__init__.py
Normal file
15
rl_coach/agents/__init__.py
Normal file
@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
165
rl_coach/agents/actor_critic_agent.py
Normal file
@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np
import scipy.signal
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
    AgentParameters, InputEmbedderParameters
from rl_coach.core_types import QActionStateValue
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import last_sample

from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters


class ActorCriticAlgorithmParameters(AlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.policy_gradient_rescaler = PolicyGradientRescaler.A_VALUE
        self.apply_gradients_every_x_episodes = 5
        self.beta_entropy = 0
        self.num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
        self.gae_lambda = 0.96
        self.estimate_state_value_using_gae = False


class ActorCriticNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters()
        self.heads_parameters = [VHeadParameters(), PolicyHeadParameters()]
        self.loss_weights = [0.5, 1.0]
        self.rescale_gradient_from_head_by_factor = [1, 1]
        self.optimizer_type = 'Adam'
        self.clip_gradients = 40.0
        self.async_training = True


class ActorCriticAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=ActorCriticAlgorithmParameters(),
                         exploration=None,  # TODO this should be different for continuous (ContinuousEntropyExploration)
                         # and discrete (CategoricalExploration) action spaces.
                         memory=SingleEpisodeBufferParameters(),
                         networks={"main": ActorCriticNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.actor_critic_agent:ActorCriticAgent'


# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        self.last_gradient_update_step_idx = 0
        self.action_advantages = self.register_signal('Advantages')
        self.state_values = self.register_signal('Values')
        self.value_loss = self.register_signal('Value Loss')
        self.policy_loss = self.register_signal('Policy Loss')

    # Discounting function used to calculate discounted returns.
    def discount(self, x, gamma):
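        # The backward linear filter below is equivalent to the recursion
        # y[t] = x[t] + gamma * y[t + 1] (with y[T] = x[T]), i.e. a discounted
        # cumulative sum computed from the end of the trajectory towards its start.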
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def get_general_advantage_estimation_values(self, rewards, values):
        # values contain n+1 elements (V(s_t) ... V(s_t+n)), rewards contain n elements (r_t ... r_t+n-1)
        bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])

        # Approximation based calculation of GAE (mathematically correct only when Tmax = inf,
        # although in practice works even in much smaller Tmax values, e.g. 20)
        deltas = rewards + self.ap.algorithm.discount * values[1:] - values[:-1]
        gae = self.discount(deltas, self.ap.algorithm.discount * self.ap.algorithm.gae_lambda)
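        # In other words: deltas[t] = r[t] + discount * V(s[t+1]) - V(s[t]) is the one-step TD error,
        # and gae[t] = sum_k (discount * gae_lambda)^k * deltas[t+k], computed here with a single
        # backward filter pass over the deltas.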

        if self.ap.algorithm.estimate_state_value_using_gae:
            discounted_returns = np.expand_dims(gae + values[:-1], -1)
        else:
            discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
                                                                       self.ap.algorithm.discount)), 1)[:-1]
        return gae, discounted_returns

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # get the values for the current states

        result = self.networks['main'].online_network.predict(batch.states(network_keys))
        current_state_values = result[0]

        self.state_values.add_sample(current_state_values)

        # the targets for the state value estimator
        num_transitions = batch.size
        state_value_head_targets = np.zeros((num_transitions, 1))

        # estimate the advantage function
        action_advantages = np.zeros((num_transitions, 1))

        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            if batch.game_overs()[-1]:
                R = 0
            else:
                R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]

            for i in reversed(range(num_transitions)):
                R = batch.rewards()[i] + self.ap.algorithm.discount * R
                state_value_head_targets[i] = R
                action_advantages[i] = R - current_state_values[i]

        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
            values = np.append(current_state_values, bootstrapped_value)
            if batch.game_overs()[-1]:
                values[-1] = 0

            # get general discounted returns table
            gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
            action_advantages = np.vstack(gae_values)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        action_advantages = action_advantages.squeeze(axis=-1)
        actions = batch.actions()
        if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        # train
        result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
                                                                            'output_1_0': actions},
                                                                           [state_value_head_targets, action_advantages])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.action_advantages.add_sample(action_advantages)
        self.unclipped_grads.add_sample(unclipped_grads)
        self.value_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])

        return total_loss, losses, unclipped_grads

    def get_prediction(self, states):
        tf_input_state = self.prepare_batch_for_inference(states, "main")
        return self.networks['main'].online_network.predict(tf_input_state)[1:]  # index 0 is the state value
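The GAE path above reduces to a single backward filter pass over the TD errors. A minimal, self-contained sketch of the same computation (illustrative only, not part of this file; it assumes nothing beyond numpy and scipy):

import numpy as np
import scipy.signal


def discount(x, gamma):
    # same backward filter used by ActorCriticAgent.discount() above
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]


discount_factor, gae_lambda = 0.99, 0.96
rewards = np.array([1.0, 0.0, 0.5])          # r_t ... r_{t+n-1}
values = np.array([0.9, 0.8, 0.7, 0.6])      # V(s_t) ... V(s_{t+n}), n+1 entries

deltas = rewards + discount_factor * values[1:] - values[:-1]
gae = discount(deltas, discount_factor * gae_lambda)

# the explicit backward recursion gives the same numbers
reference = np.zeros_like(deltas)
running = 0.0
for t in reversed(range(len(deltas))):
    running = deltas[t] + discount_factor * gae_lambda * running
    reference[t] = running
assert np.allclose(gae, reference)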
791
rl_coach/agents/agent.py
Normal file
@@ -0,0 +1,791 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
import random
from collections import OrderedDict
from typing import Dict, List, Union, Tuple

import numpy as np

from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.core_types import RunPhase, PredictionType, EnvironmentEpisodes, ActionType, Batch, Episode, StateType
from rl_coach.core_types import Transition, ActionInfo, TrainingSteps, EnvironmentSteps, EnvResponse
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
from pandas import read_pickle
from six.moves import range
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace, GoalsSpace, AttentionActionSpace
from rl_coach.utils import Signal, force_list, set_cpu
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params

from rl_coach.architectures.network_wrapper import NetworkWrapper
from rl_coach.logger import screen, Logger, EpisodeLogger


class Agent(AgentInterface):
    def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        """
        :param agent_parameters: A Preset class instance with all the running parameters
        """
        super().__init__()
        self.ap = agent_parameters
        self.task_id = self.ap.task_parameters.task_index
        self.is_chief = self.task_id == 0
        self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
                             and self.ap.memory.shared_memory
        if self.shared_memory:
            self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
        self.name = agent_parameters.name
        self.parent = parent
        self.parent_level_manager = None
        self.full_name_id = agent_parameters.full_name_id = self.name

        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
                             "tensorflow wake up time)".format(self.full_name_id, self.task_id))
        else:
            screen.log_title("Creating agent - name: {}".format(self.full_name_id))
        self.imitation = False
        self.agent_logger = Logger()
        self.agent_episode_logger = EpisodeLogger()

        # get the memory
        # - distributed training + shared memory:
        #   * is chief? -> create the memory and add it to the scratchpad
        #   * not chief? -> wait for the chief to create the memory and then fetch it
        # - non distributed training / not shared memory:
        #   * create memory
        memory_name = self.ap.memory.path.split(':')[1]
        self.memory_lookup_name = self.full_name_id + '.' + memory_name
        if self.shared_memory and not self.is_chief:
            self.memory = self.shared_memory_scratchpad.get(self.memory_lookup_name)
        else:
            # modules
            if agent_parameters.memory.load_memory_from_file_path:
                screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
                                 .format(agent_parameters.memory.load_memory_from_file_path))
                self.memory = read_pickle(agent_parameters.memory.load_memory_from_file_path)
            else:
                self.memory = dynamic_import_and_instantiate_module_from_params(self.ap.memory)

            if self.shared_memory and self.is_chief:
                self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)

        # set devices
        if type(agent_parameters.task_parameters) == DistributedTaskParameters:
            self.has_global = True
            self.replicated_device = agent_parameters.task_parameters.device
            self.worker_device = "/job:worker/task:{}".format(self.task_id)
        else:
            self.has_global = False
            self.replicated_device = None
            self.worker_device = ""
        if agent_parameters.task_parameters.use_cpu:
            self.worker_device += "/cpu:0"
        else:
            self.worker_device += "/device:GPU:0"

        # filters
        self.input_filter = self.ap.input_filter
        self.output_filter = self.ap.output_filter
        self.pre_network_filter = self.ap.pre_network_filter
        device = self.replicated_device if self.replicated_device else self.worker_device
        self.input_filter.set_device(device)
        self.output_filter.set_device(device)
        self.pre_network_filter.set_device(device)

        # initialize all internal variables
        self._phase = RunPhase.HEATUP
        self.total_shaped_reward_in_current_episode = 0
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.last_target_network_update_step = 0
        self.last_training_phase_step = 0
        self.current_episode = self.ap.current_episode = 0
        self.curr_state = {}
        self.current_hrl_goal = None
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        self.agent_logger.set_current_time(self.current_episode)
        self.exploration_policy = None
        self.networks = {}
        self.last_action_info = None
        self.running_observation_stats = None
        self.running_reward_stats = None
        self.accumulated_rewards_across_evaluation_episodes = 0
        self.accumulated_shaped_rewards_across_evaluation_episodes = 0
        self.num_successes_across_evaluation_episodes = 0
        self.num_evaluation_episodes_completed = 0
        self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
        # TODO: add agents observation rendering for debugging purposes (not the same as the environment rendering)

        # environment parameters
        self.spaces = None
        self.in_action_space = self.ap.algorithm.in_action_space

        # signals
        self.episode_signals = []
        self.step_signals = []
        self.loss = self.register_signal('Loss')
        self.curr_learning_rate = self.register_signal('Learning Rate')
        self.unclipped_grads = self.register_signal('Grads (unclipped)')
        self.reward = self.register_signal('Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
        self.shaped_reward = self.register_signal('Shaped Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
        if isinstance(self.in_action_space, GoalsSpace):
            self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)

        # use seed
        if self.ap.task_parameters.seed is not None:
            random.seed(self.ap.task_parameters.seed)
            np.random.seed(self.ap.task_parameters.seed)

    @property
    def parent(self):
        """
        Get the parent class of the agent
        :return: the current parent
        """
        return self._parent

    @parent.setter
    def parent(self, val):
        """
        Change the parent class of the agent.
        Additionally, updates the full name of the agent
        :param val: the new parent
        :return: None
        """
        self._parent = val
        if self._parent is not None:
            if not hasattr(self._parent, 'name'):
                raise ValueError("The parent of an agent must have a name")
            self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)

    def setup_logger(self):
        # dump documentation
        logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
            format(graph_name=self.parent_level_manager.parent_graph_manager.name,
                   level_name=self.parent_level_manager.name,
                   agent_full_id='.'.join(self.full_name_id.split('/')))
        self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
                                               add_timestamp=True, task_id=self.task_id)
        if self.ap.visualization.dump_in_episode_signals:
            self.agent_episode_logger.set_logger_filenames(self.ap.task_parameters.experiment_path,
                                                           logger_prefix=logger_prefix,
                                                           add_timestamp=True, task_id=self.task_id)

    def set_session(self, sess) -> None:
        """
        Set the deep learning framework session for all the agents in the composite agent
        :return: None
        """
        self.input_filter.set_session(sess)
        self.output_filter.set_session(sess)
        self.pre_network_filter.set_session(sess)
        [network.set_session(sess) for network in self.networks.values()]

    def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True,
                        dump_one_value_per_step: bool=False) -> Signal:
        """
        Register a signal such that its statistics will be dumped and be viewable through dashboard
        :param signal_name: the name of the signal as it will appear in dashboard
        :param dump_one_value_per_episode: should the signal value be written for each episode?
        :param dump_one_value_per_step: should the signal value be written for each step?
        :return: the created signal
        """
        signal = Signal(signal_name)
        if dump_one_value_per_episode:
            self.episode_signals.append(signal)
        if dump_one_value_per_step:
            self.step_signals.append(signal)
        return signal

    def set_environment_parameters(self, spaces: SpacesDefinition):
        """
        Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
        dependent on those values, by calling init_environment_dependent_modules
        :param spaces: the environment spaces definition
        :return: None
        """
        self.spaces = copy.deepcopy(spaces)

        if self.ap.algorithm.use_accumulated_reward_as_measurement:
            if 'measurements' in self.spaces.state.sub_spaces:
                self.spaces.state['measurements'].shape += 1
                self.spaces.state['measurements'].measurements_names += ['accumulated_reward']
            else:
                self.spaces.state['measurements'] = VectorObservationSpace(1, measurements_names=['accumulated_reward'])

        for observation_name in self.spaces.state.sub_spaces.keys():
            self.spaces.state[observation_name] = \
                self.pre_network_filter.get_filtered_observation_space(observation_name,
                    self.input_filter.get_filtered_observation_space(observation_name,
                        self.spaces.state[observation_name]))

        self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
            self.input_filter.get_filtered_reward_space(self.spaces.reward))

        self.spaces.action = self.output_filter.get_unfiltered_action_space(self.spaces.action)

        if isinstance(self.in_action_space, GoalsSpace):
            # TODO: what if the goal type is an embedding / embedding change?
            self.spaces.goal = self.in_action_space
            self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])

        self.init_environment_dependent_modules()

    def create_networks(self) -> Dict[str, NetworkWrapper]:
        """
        Create all the networks of the agent.
        The network creation will be done after setting the environment parameters for the agent, since they are needed
        for creating the network.
        :return: A list containing all the networks
        """
        networks = {}
        for network_name in sorted(self.ap.network_wrappers.keys()):
            networks[network_name] = NetworkWrapper(name=network_name,
                                                    agent_parameters=self.ap,
                                                    has_target=self.ap.network_wrappers[network_name].create_target_network,
                                                    has_global=self.has_global,
                                                    spaces=self.spaces,
                                                    replicated_device=self.replicated_device,
                                                    worker_device=self.worker_device)
        return networks

    def init_environment_dependent_modules(self) -> None:
        """
        Initialize any modules that depend on knowing information about the environment such as the action space or
        the observation space
        :return: None
        """
        # initialize exploration policy
        self.ap.exploration.action_space = self.spaces.action
        self.exploration_policy = dynamic_import_and_instantiate_module_from_params(self.ap.exploration)

        # create all the networks of the agent
        self.networks = self.create_networks()

    @property
    def phase(self) -> RunPhase:
        return self._phase

    @phase.setter
    def phase(self, val: RunPhase) -> None:
        """
        Change the phase of the run for the agent and all the sub components
        :param val: the new run phase (TRAIN, TEST, etc.)
        :return: None
        """
        self.reset_evaluation_state(val)
        self._phase = val
        self.exploration_policy.change_phase(val)

    def reset_evaluation_state(self, val: RunPhase) -> None:
        starting_evaluation = (val == RunPhase.TEST)
        ending_evaluation = (self.phase == RunPhase.TEST)

        if starting_evaluation:
            self.accumulated_rewards_across_evaluation_episodes = 0
            self.accumulated_shaped_rewards_across_evaluation_episodes = 0
            self.num_successes_across_evaluation_episodes = 0
            self.num_evaluation_episodes_completed = 0
            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
                screen.log_title("{}: Starting evaluation phase".format(self.name))

        elif ending_evaluation:
            # we write to the next episode, because it could be that the current episode was already written
            # to disk and then we won't write it again
            self.agent_logger.set_current_time(self.current_episode + 1)
            self.agent_logger.create_signal_value(
                'Evaluation Reward',
                self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
            self.agent_logger.create_signal_value(
                'Shaped Evaluation Reward',
                self.accumulated_shaped_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
            success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
            self.agent_logger.create_signal_value(
                "Success Rate",
                success_rate
            )
            if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
                screen.log_title("{}: Finished evaluation phase. Success rate = {}"
                                 .format(self.name, np.round(success_rate, 2)))

    def call_memory(self, func, args=()):
        """
        This function is a wrapper to allow having the same calls for shared or unshared memories.
        It should be used instead of calling the memory directly in order to allow different algorithms to work
        both with a shared and a local memory.
        :param func: the name of the memory function to call
        :param args: the arguments to supply to the function
        :return: the return value of the function
        """
        if self.shared_memory:
            result = self.shared_memory_scratchpad.internal_call(self.memory_lookup_name, func, args)
        else:
            if type(args) != tuple:
                args = (args,)
            result = getattr(self.memory, func)(*args)
        return result

    def log_to_screen(self):
        # log to screen
        log = OrderedDict()
        log["Name"] = self.full_name_id
        if self.task_id is not None:
            log["Worker"] = self.task_id
        log["Episode"] = self.current_episode
        log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
        log["Exploration"] = np.round(self.exploration_policy.get_control_param(), 2)
        log["Steps"] = self.total_steps_counter
        log["Training iteration"] = self.training_iteration
        screen.log_dict(log, prefix=self.phase.value)

    def update_step_in_episode_log(self):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        self.agent_episode_logger.set_current_time(self.current_episode_steps_counter)
        self.agent_episode_logger.create_signal_value('Training Iter', self.training_iteration)
        self.agent_episode_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
        self.agent_episode_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
        self.agent_episode_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
        self.agent_episode_logger.create_signal_value('Total steps', self.total_steps_counter)
        self.agent_episode_logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
        self.agent_episode_logger.create_signal_value("Shaped Accumulated Reward", self.total_shaped_reward_in_current_episode)
        self.agent_episode_logger.create_signal_value('Update Target Network', 0, overwrite=False)
        self.agent_episode_logger.update_wall_clock_time(self.current_episode_steps_counter)

        for signal in self.step_signals:
            self.agent_episode_logger.create_signal_value(signal.name, signal.get_last_value())

        # dump
        self.agent_episode_logger.dump_output_csv()

    def update_log(self):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        self.agent_logger.set_current_time(self.current_episode)
        self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
        self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
        self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
        self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
        self.agent_logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
        self.agent_logger.create_signal_value('Total steps', self.total_steps_counter)
        self.agent_logger.create_signal_value("Epsilon", np.mean(self.exploration_policy.get_control_param()))
        self.agent_logger.create_signal_value("Shaped Training Reward", self.total_shaped_reward_in_current_episode
                                              if self._phase == RunPhase.TRAIN else np.nan)
        self.agent_logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
                                              if self._phase == RunPhase.TRAIN else np.nan)

        self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
        self.agent_logger.update_wall_clock_time(self.current_episode)

        if self._phase != RunPhase.TEST:
            self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
            self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
            self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)

        for signal in self.episode_signals:
            self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
            self.agent_logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
            self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # dump
        if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            self.agent_logger.dump_output_csv()

    def handle_episode_ended(self) -> None:
        """
        End an episode
        :return: None
        """
        self.current_episode_buffer.is_complete = True

        if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
            self.current_episode += 1

        if self.phase != RunPhase.TEST and isinstance(self.memory, EpisodicExperienceReplay):
            self.call_memory('store_episode', self.current_episode_buffer)

        if self.phase == RunPhase.TEST:
            self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
            self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
            self.num_evaluation_episodes_completed += 1

            if self.spaces.reward.reward_success_threshold and \
                    self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
                self.num_successes_across_evaluation_episodes += 1

        if self.ap.visualization.dump_csv:
            self.update_log()

        if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
            self.log_to_screen()

    def reset_internal_state(self):
        """
        Reset all the episodic parameters
        :return: None
        """
        for signal in self.episode_signals:
            signal.reset()
        for signal in self.step_signals:
            signal.reset()
        self.agent_episode_logger.set_episode_idx(self.current_episode)
        self.total_shaped_reward_in_current_episode = 0
        self.total_reward_in_current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
        if self.exploration_policy:
            self.exploration_policy.reset()
        self.input_filter.reset()
        self.output_filter.reset()
        self.pre_network_filter.reset()
        if isinstance(self.memory, EpisodicExperienceReplay):
            self.call_memory('verify_last_episode_is_closed')

        for network in self.networks.values():
            network.online_network.reset_internal_memory()

    def learn_from_batch(self, batch) -> Tuple[float, List, List]:
        """
        Given a batch of transitions, calculates their target values and updates the network.
        :param batch: A list of transitions
        :return: The total loss of the training, the loss per head and the unclipped gradients
        """
        return 0, [], []

    def _should_update_online_weights_to_target(self):
        """
        Determine if online weights should be copied to the target.
        :return: boolean: True if the online weights should be copied to the target.
        """
        # update the target network of every network that has a target network
        step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
        if step_method.__class__ == TrainingSteps:
            should_update = (self.training_iteration - self.last_target_network_update_step) >= step_method.num_steps
            if should_update:
                self.last_target_network_update_step = self.training_iteration
        elif step_method.__class__ == EnvironmentSteps:
            should_update = (self.total_steps_counter - self.last_target_network_update_step) >= step_method.num_steps
            if should_update:
                self.last_target_network_update_step = self.total_steps_counter
        else:
            raise ValueError("The num_steps_between_copying_online_weights_to_target parameter should be either "
                             "EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
        return should_update

    def _should_train(self, wait_for_full_episode=False):
        """
        Determine if we should start a training phase according to the number of steps passed since the last training
        :return: boolean: True if we should start a training phase
        """
        step_method = self.ap.algorithm.num_consecutive_playing_steps
        if step_method.__class__ == EnvironmentEpisodes:
            should_update = (self.current_episode - self.last_training_phase_step) >= step_method.num_steps
            if should_update:
                self.last_training_phase_step = self.current_episode
        elif step_method.__class__ == EnvironmentSteps:
            should_update = (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
            if wait_for_full_episode:
                should_update = should_update and self.current_episode_steps_counter == 0
            if should_update:
                self.last_training_phase_step = self.total_steps_counter
        else:
            raise ValueError("The num_consecutive_playing_steps parameter should be either "
                             "EnvironmentSteps or EnvironmentEpisodes. Instead it is {}".format(step_method.__class__))
        return should_update

    def train(self):
        """
        Check if a training phase should be done as configured by num_consecutive_playing_steps.
        If it should, then do several training steps as configured by num_consecutive_training_steps.
        A single training iteration: Sample a batch, train on it and update target networks.
        :return: The total training loss during the training iterations.
        """
        loss = 0
        if self._should_train():
            for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
                # TODO: this should be network dependent
                network_parameters = list(self.ap.network_wrappers.values())[0]

                # update counters
                self.training_iteration += 1

                # sample a batch and train on it
                batch = self.call_memory('sample', network_parameters.batch_size)
                if self.pre_network_filter is not None:
                    batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)

                # if the batch returned empty then there are not enough samples in the replay buffer -> skip
                # training step
                if len(batch) > 0:
                    # train
                    batch = Batch(batch)
                    total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
                    loss += total_loss
                    self.unclipped_grads.add_sample(unclipped_grads)

                    # TODO: the learning rate decay should be done through the network instead of here
                    # decay learning rate
                    if network_parameters.learning_rate_decay_rate != 0:
                        self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
                            self.networks['main'].online_network.current_learning_rate))
                    else:
                        self.curr_learning_rate.add_sample(network_parameters.learning_rate)

                    if any([network.has_target for network in self.networks.values()]) \
                            and self._should_update_online_weights_to_target():
                        for network in self.networks.values():
                            network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)

                        self.agent_logger.create_signal_value('Update Target Network', 1)
                    else:
                        self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)

                    self.loss.add_sample(loss)

            if self.imitation:
                self.log_to_screen()

        # run additional commands after the training is done
        self.post_training_commands()

        return loss

    def choose_action(self, curr_state):
        """
        Choose an action to act with in the current episode being played. Different behavior might be exhibited when
        training or testing.

        :param curr_state: the current state to act upon.
        :return: chosen action, some action value describing the action (q-value, probability, etc)
        """
        pass

    def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
                                    network_name: str):
        """
        Convert curr_state into the input tensors tensorflow is expecting. i.e. if we have several input states, stack
        all observations together, measurements together, etc.
        """
        # convert to batch so we can run it through the network
        states = force_list(states)
        batches_dict = {}
        for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
            # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
            # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
            # addition to the current_state, so that all the inputs of the network will be filled)
            if key in states[0].keys():
                batches_dict[key] = np.array([np.array(state[key]) for state in states])

        return batches_dict

    def act(self) -> ActionInfo:
        """
        Given the agent's current knowledge, decide on the next action to apply to the environment
        :return: an action and a dictionary containing any additional info from the action decision process
        """
        if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
            # This agent never plays while training (e.g. behavioral cloning)
            return None

        # count steps (only when training or if we are in the evaluation worker)
        if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
            self.total_steps_counter += 1
        self.current_episode_steps_counter += 1

        # decide on the action
        if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
            # random action
            self.last_action_info = self.spaces.action.sample_with_info()
        else:
            # informed action
            if self.pre_network_filter is not None:
                # before choosing an action, first use the pre_network_filter to filter out the current state
                curr_state = self.run_pre_network_filter_for_inference(self.curr_state)

            else:
                curr_state = self.curr_state
            self.last_action_info = self.choose_action(curr_state)

        filtered_action_info = self.output_filter.filter(self.last_action_info)

        return filtered_action_info

    def run_pre_network_filter_for_inference(self, state: StateType):
        dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
        return self.pre_network_filter.filter(dummy_env_response)[0].next_state

    def get_state_embedding(self, state: dict) -> np.ndarray:
        """
        Given a state, get the corresponding state embedding from the main network
        :param state: a state dict
        :return: a numpy embedding vector
        """
        # TODO: this won't work anymore
        # TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
        embedding = self.networks['main'].online_network.predict(
            self.prepare_batch_for_inference(state, "main"),
            outputs=self.networks['main'].online_network.state_embedding)
        return embedding

    def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
        """
        Allows agents to update the transition just before adding it to the replay buffer.
        Can be useful for agents that want to tweak the reward, termination signal, etc.
        :param transition: the transition to update
        :return: the updated transition
        """
        return transition

    def observe(self, env_response: EnvResponse) -> bool:
        """
        Given a response from the environment, distill the observation from it and store it for later use.
        The response should be a dictionary containing the performed action, the new observation and measurements,
        the reward, a game over flag and any additional information necessary.
        :param env_response: result of call from environment.step(action)
        :return:
        """

        # filter the env_response
        filtered_env_response = self.input_filter.filter(env_response)[0]

        # inject agent collected statistics, if required
        if self.ap.algorithm.use_accumulated_reward_as_measurement:
            if 'measurements' in filtered_env_response.next_state:
                filtered_env_response.next_state['measurements'] = np.append(filtered_env_response.next_state['measurements'],
                                                                             self.total_shaped_reward_in_current_episode)
            else:
                filtered_env_response.next_state['measurements'] = np.array([self.total_shaped_reward_in_current_episode])

        # if we are in the first step in the episode, then we don't have a next state and a reward and thus no
        # transition yet, and therefore we don't need to store anything in the memory.
        # also we did not reach the goal yet.
        if self.current_episode_steps_counter == 0:
            # initialize the current state
            self.curr_state = filtered_env_response.next_state
            return env_response.game_over
        else:
            transition = Transition(state=copy.copy(self.curr_state), action=self.last_action_info.action,
                                    reward=filtered_env_response.reward, next_state=filtered_env_response.next_state,
                                    game_over=filtered_env_response.game_over, info=filtered_env_response.info)

            # now that we have formed a basic transition - the next state progresses to be the current state
            self.curr_state = filtered_env_response.next_state

            # make agent specific changes to the transition if needed
            transition = self.update_transition_before_adding_to_replay_buffer(transition)

            # merge the intrinsic reward in
            if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
                transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
            else:
                transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward

            # sum up the total shaped reward
            self.total_shaped_reward_in_current_episode += transition.reward
            self.total_reward_in_current_episode += env_response.reward
            self.shaped_reward.add_sample(transition.reward)
            self.reward.add_sample(env_response.reward)

            # add action info to transition
            if type(self.parent).__name__ == 'CompositeAgent':
                transition.add_info(self.parent.last_action_info.__dict__)
            else:
                transition.add_info(self.last_action_info.__dict__)

            # create and store the transition
            if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
                # for episodic memories we keep the transitions in a local buffer until the episode is ended.
                # for regular memories we insert the transitions directly to the memory
                if isinstance(self.memory, EpisodicExperienceReplay):
                    self.current_episode_buffer.insert(transition)
                else:
                    self.call_memory('store', transition)

            if self.ap.visualization.dump_in_episode_signals:
                self.update_step_in_episode_log()

            return transition.game_over

    def post_training_commands(self):
        pass

    def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
        """
        Get a prediction from the agent with regard to the requested prediction_type.
        If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
        raise a ValueException.
        :param states:
        :param prediction_type:
        :return:
        """

        predictions = self.networks['main'].online_network.predict_with_prediction_type(
            # states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
            states=states, prediction_type=prediction_type)

        if len(predictions.keys()) != 1:
            raise ValueError("The network has more than one component {} matching the requested prediction_type {}. ".
                             format(list(predictions.keys()), prediction_type))
        return list(predictions.values())[0]

    def set_incoming_directive(self, action: ActionType) -> None:
        if isinstance(self.in_action_space, GoalsSpace):
            self.current_hrl_goal = action
        elif isinstance(self.in_action_space, AttentionActionSpace):
            self.input_filter.observation_filters['attention'].crop_low = action[0]
            self.input_filter.observation_filters['attention'].crop_high = action[1]
            self.output_filter.action_filters['masking'].set_masking(action[0], action[1])

    def save_checkpoint(self, checkpoint_id: int) -> None:
        """
        Allows agents to store additional information when saving checkpoints.
        :param checkpoint_id: the id of the checkpoint
        :return: None
        """
        pass

    def sync(self) -> None:
        """
        Sync the global network parameters to local networks
        :return: None
        """
        for network in self.networks.values():
            network.sync()
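_should_train() and _should_update_online_weights_to_target() above share one gating pattern: an update fires once the relevant counter has advanced by at least num_steps since the last firing, and firing resets the reference point. A minimal, self-contained sketch of that pattern (illustrative only; the names below are hypothetical, not part of the commit):

def should_fire(counter, last_fired, num_steps):
    # mirrors: (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
    return (counter - last_fired) >= num_steps


last_trained = 0
fired_at = []
for step in range(1, 21):
    if should_fire(step, last_trained, 4):   # e.g. a schedule of one training phase every 4 steps
        last_trained = step
        fired_at.append(step)
print(fired_at)   # [4, 8, 12, 16, 20]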
125
rl_coach/agents/agent_interface.py
Normal file
@@ -0,0 +1,125 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union, List, Dict

import numpy as np

from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType


class AgentInterface(object):
    def __init__(self):
        self._phase = RunPhase.HEATUP
        self._parent = None
        self.spaces = None

    @property
    def parent(self):
        """
        Get the parent class of the agent
        :return: the current parent
        """
        return self._parent

    @parent.setter
    def parent(self, val):
        """
        Change the parent class of the agent
        :param val: the new parent
        :return: None
        """
        self._parent = val

    @property
    def phase(self) -> RunPhase:
        """
        Get the phase of the agent
        :return: the current phase
        """
        return self._phase

    @phase.setter
    def phase(self, val: RunPhase):
        """
        Change the phase of the agent
        :param val: the new phase
        :return: None
        """
        self._phase = val

    def reset_internal_state(self) -> None:
        """
        Reset the episode parameters for the agent
        :return: None
        """
        raise NotImplementedError("")

    def train(self) -> Union[float, List]:
        """
        Train the agent's network
        :return: The loss of the training
        """
        raise NotImplementedError("")

    def act(self) -> ActionInfo:
        """
        Get a decision of the next action to take.
        The action is dependent on the current state which the agent holds from resetting the environment or from
        the observe function.
        :return: A tuple containing the actual action and additional info on the action
        """
        raise NotImplementedError("")

    def observe(self, env_response: EnvResponse) -> bool:
        """
        Gets a response from the environment.
        Processes this information for later use. For example, create a transition and store it in memory.
        The action info (a class containing any info the agent wants to store regarding its action decision process) is
        stored by the agent itself when deciding on the action.
        :param env_response: an EnvResponse containing the response from the environment
        :return: a done signal which is based on the agent's knowledge. This can be different from the done signal from
                 the environment. For example, an agent can decide to finish the episode each time it gets some
                 intrinsic reward
        """
        raise NotImplementedError("")

    def save_checkpoint(self, checkpoint_id: int) -> None:
        """
        Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
        :param checkpoint_id: the checkpoint id to use for saving
        :return: None
        """
        raise NotImplementedError("")

    def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
        """
        Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
        type of prediction_type, or if there is more than one possible way to do so, raise a ValueException.
        :param states:
        :param prediction_type:
        :return: the agent's prediction
        """
        raise NotImplementedError("")

    def set_incoming_directive(self, action: ActionType) -> None:
        """
        Pass a higher level command (directive) to the agent.
        For example, a higher level agent can set the goal of the agent.
        :param action: the directive to pass to the agent
        :return: None
        """
        raise NotImplementedError("")
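AgentInterface defines the surface that Agent (above) and any custom agent must cover; the base methods simply raise NotImplementedError. A minimal do-nothing subclass, as a sketch only (it assumes rl_coach is importable and is not part of this commit):

from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.core_types import EnvResponse


class NoOpAgent(AgentInterface):
    """Does nothing useful; exists only to show which methods a concrete agent overrides."""
    def reset_internal_state(self) -> None:
        pass

    def train(self):
        return 0.0

    def act(self):
        return None   # a real agent returns an ActionInfo, as Agent.act() above does

    def observe(self, env_response: EnvResponse) -> bool:
        return env_response.game_over

    def save_checkpoint(self, checkpoint_id: int) -> None:
        pass

    def get_predictions(self, states, prediction_type):
        raise NotImplementedError("NoOpAgent makes no predictions")

    def set_incoming_directive(self, action) -> None:
        pass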
81
rl_coach/agents/bc_agent.py
Normal file
@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Union

import numpy as np
from rl_coach.agents.imitation_agent import ImitationAgent
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters

from rl_coach.base_parameters import AgentParameters, AlgorithmParameters, NetworkParameters, InputEmbedderParameters, \
    MiddlewareScheme
from rl_coach.exploration_policies.e_greedy import EGreedyParameters


class BCAlgorithmParameters(AlgorithmParameters):
    def __init__(self):
        super().__init__()
        self.collect_new_data = False


class BCNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
        self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
        self.heads_parameters = [PolicyHeadParameters()]
        self.loss_weights = [1.0]
        self.optimizer_type = 'Adam'
        self.batch_size = 32
        self.replace_mse_with_huber_loss = False
        self.create_target_network = False


class BCAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=BCAlgorithmParameters(),
                         exploration=EGreedyParameters(),
                         memory=EpisodicExperienceReplayParameters(),
                         networks={"main": BCNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.bc_agent:BCAgent'


# Behavioral Cloning Agent
class BCAgent(ImitationAgent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)

    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # When using a policy head, the targets refer to the advantages that we are normally feeding the head with.
        # In this case, we need the policy head to just predict probabilities, so while we usually train the network
        # with log(Pi)*Advantages, in this specific case we will train it to log(Pi), which after the softmax will
        # predict Pi (=probabilities)
        targets = np.ones(batch.actions().shape[0])

        result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
                                                                'output_0_0': batch.actions()},
                                                               targets)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads
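The all-ones targets above make the policy-gradient surrogate collapse to a plain negative log-likelihood, which is exactly the behavioral cloning objective. A small, standalone numpy illustration (not part of this file):

import numpy as np

logits = np.array([2.0, 0.5, -1.0])                  # policy head output for one state
probs = np.exp(logits) / np.sum(np.exp(logits))      # softmax
action = 0                                           # the demonstrated action
advantage = 1.0                                      # the all-ones target used above

pg_surrogate = -np.log(probs[action]) * advantage    # what the policy head optimizes
nll = -np.log(probs[action])                         # plain behavioral-cloning loss
assert np.isclose(pg_surrogate, nll)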
84
rl_coach/agents/bootstrapped_dqn_agent.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
|
||||
from rl_coach.exploration_policies.bootstrapped import BootstrappedParameters
|
||||
|
||||
|
||||
class BootstrappedDQNNetworkParameters(DQNNetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_output_head_copies = 10
|
||||
self.rescale_gradient_from_head_by_factor = [1.0/self.num_output_head_copies]*self.num_output_head_copies
|
||||
|
||||
|
||||
class BootstrappedDQNAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.network_wrappers = {"main": BootstrappedDQNNetworkParameters()}
|
||||
self.exploration = BootstrappedParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.bootstrapped_dqn_agent:BootstrappedDQNAgent'
|
||||
|
||||
|
||||
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
|
||||
class BootstrappedDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def reset_internal_state(self):
|
||||
super().reset_internal_state()
|
||||
self.exploration_policy.select_head()
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
next_states_online_values = self.networks['main'].online_network.predict(batch.next_states(network_keys))
|
||||
result = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
q_st_plus_1 = result[:self.ap.exploration.architecture_num_q_heads]
|
||||
TD_targets = result[self.ap.exploration.architecture_num_q_heads:]
|
||||
|
||||
# initialize with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
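# each head is updated only for transitions whose bootstrap mask includes it; the update itself is
# Double-DQN style: the online copy of the head selects argmax_a Q(s_t+1, a) and the target copy evaluates it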
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
mask = batch[i].info['mask']
|
||||
for head_idx in range(self.ap.exploration.architecture_num_q_heads):
|
||||
if mask[head_idx] == 1:
|
||||
selected_action = np.argmax(next_states_online_values[head_idx][i], 0)
|
||||
TD_targets[head_idx][i, batch.actions()[i]] = \
|
||||
batch.rewards()[i] + (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount \
|
||||
* q_st_plus_1[head_idx][i][selected_action]
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def observe(self, env_response):
|
||||
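# draw an independent Bernoulli(bootstrapped_data_sharing_probability) mask over the heads;
# it is stored with the transition and decides which heads will later train on it (see learn_from_batch)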
mask = np.random.binomial(1, self.ap.exploration.bootstrapped_data_sharing_probability,
|
||||
self.ap.exploration.architecture_num_q_heads)
|
||||
env_response.info['mask'] = mask
|
||||
return super().observe(env_response)
114
rl_coach/agents/categorical_dqn_agent.py
Normal file
@@ -0,0 +1,114 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.dqn_agent import DQNNetworkParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.categorical_q_head import CategoricalQHeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import StateType
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class CategoricalDQNNetworkParameters(DQNNetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.heads_parameters = [CategoricalQHeadParameters()]
|
||||
|
||||
|
||||
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.v_min = -10.0
|
||||
self.v_max = 10.0
|
||||
self.atoms = 51
|
||||
|
||||
|
||||
class CategoricalDQNExplorationParameters(EGreedyParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
|
||||
self.evaluation_epsilon = 0.001
|
||||
|
||||
|
||||
class CategoricalDQNAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
|
||||
exploration=CategoricalDQNExplorationParameters(),
|
||||
memory=ExperienceReplayParameters(),
|
||||
networks={"main": CategoricalDQNNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'
|
||||
|
||||
|
||||
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
|
||||
class CategoricalDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)
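# fixed categorical support: 'atoms' points evenly spaced over [v_min, v_max]; with the defaults this is
# 51 atoms on [-10, 10], i.e. a bin width dz = 0.4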
|
||||
|
||||
def distribution_prediction_to_q_values(self, prediction):
|
||||
return np.dot(prediction, self.z_values)
|
||||
|
||||
# prediction's format is (batch,actions,atoms)
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
prediction = self.get_prediction(states)
|
||||
q_values = self.distribution_prediction_to_q_values(prediction)
|
||||
else:
|
||||
q_values = None
|
||||
return q_values
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# for the action we actually took, the error is calculated by the atoms distribution
|
||||
# for all other actions, the error is 0
|
||||
distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)
|
||||
m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
|
||||
|
||||
batches = np.arange(self.ap.network_wrappers['main'].batch_size)
|
||||
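# C51 distributional projection: for each atom z_j, the Bellman-updated support point is
#   Tz_j = clip(r + (1 - done) * discount * z_j, v_min, v_max)
# Its fractional index on the fixed support is b_j = (Tz_j - v_min) / dz, and the probability mass
# of atom j for the selected next action is split between the neighbouring atoms floor(b_j) and ceil(b_j)
# in proportion to their distance from b_j.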
for j in range(self.z_values.size):
|
||||
tzj = np.fmax(np.fmin(batch.rewards() +
|
||||
(1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],
|
||||
self.z_values[self.z_values.size - 1]),
|
||||
self.z_values[0])
|
||||
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
|
||||
u = (np.ceil(bj)).astype(int)
|
||||
l = (np.floor(bj)).astype(int)
|
||||
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
|
||||
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
|
||||
# total_loss = cross entropy between actual result above and predicted result for the given action
|
||||
TD_targets[batches, batch.actions()] = m
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
277
rl_coach/agents/clipped_ppo_agent.py
Normal file
@@ -0,0 +1,277 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from collections import OrderedDict
|
||||
from random import shuffle
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
|
||||
AgentParameters, InputEmbedderParameters
|
||||
from rl_coach.core_types import EnvironmentSteps, Batch, EnvResponse, StateType
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class ClippedPPONetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
|
||||
self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
|
||||
self.loss_weights = [1.0, 1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1, 1]
|
||||
self.batch_size = 64
|
||||
self.optimizer_type = 'Adam'
|
||||
self.clip_gradients = None
|
||||
self.use_separate_networks_per_head = True
|
||||
self.async_training = False
|
||||
self.l2_regularization = 0
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
|
||||
|
||||
|
||||
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_episodes_in_experience_replay = 1000000
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
|
||||
self.gae_lambda = 0.95
|
||||
self.use_kl_regularization = False
|
||||
self.clip_likelihood_ratio_using_epsilon = 0.2
|
||||
self.estimate_state_value_using_gae = True
|
||||
self.step_until_collecting_full_episodes = True
|
||||
self.beta_entropy = 0.01 # should be 0 for mujoco
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
|
||||
self.optimization_epochs = 10
|
||||
self.normalization_stats = None
|
||||
self.clipping_decay_schedule = ConstantSchedule(1)
|
||||
|
||||
|
||||
class ClippedPPOAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
|
||||
exploration=AdditiveNoiseParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": ClippedPPONetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.clipped_ppo_agent:ClippedPPOAgent'
|
||||
|
||||
|
||||
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
|
||||
class ClippedPPOAgent(ActorCriticAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
# signals definition
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
self.policy_loss = self.register_signal('Policy Loss')
|
||||
self.total_kl_divergence_during_training_process = 0.0
|
||||
self.unclipped_grads = self.register_signal('Grads (unclipped)')
|
||||
self.value_targets = self.register_signal('Value Targets')
|
||||
self.kl_divergence = self.register_signal('KL Divergence')
|
||||
self.likelihood_ratio = self.register_signal('Likelihood Ratio')
|
||||
self.clipped_likelihood_ratio = self.register_signal('Clipped Likelihood Ratio')
|
||||
|
||||
|
||||
def set_session(self, sess):
|
||||
super().set_session(sess)
|
||||
if self.ap.algorithm.normalization_stats is not None:
|
||||
self.ap.algorithm.normalization_stats.set_session(sess)
|
||||
|
||||
def fill_advantages(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
current_state_values = self.networks['main'].online_network.predict(batch.states(network_keys))[0]
|
||||
current_state_values = current_state_values.squeeze()
|
||||
self.state_values.add_sample(current_state_values)
|
||||
|
||||
# calculate advantages
|
||||
advantages = []
|
||||
value_targets = []
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
|
||||
advantages = batch.total_returns() - current_state_values
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
|
||||
# get bootstraps
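# GAE (https://arxiv.org/abs/1506.02438): per-step TD residuals delta_t = r_t + gamma * V(s_t+1) - V(s_t)
# are accumulated per episode as A_t = sum_k (gamma * lambda)^k * delta_t+k, trading off bias and variance
# through gae_lambda; a zero bootstrap value is appended at each episode boundary.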
|
||||
episode_start_idx = 0
|
||||
advantages = np.array([])
|
||||
value_targets = np.array([])
|
||||
for idx, game_over in enumerate(batch.game_overs()):
|
||||
if game_over:
|
||||
# get advantages for the rollout
|
||||
value_bootstrapping = np.zeros((1,))
|
||||
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
|
||||
|
||||
rollout_advantages, gae_based_value_targets = \
|
||||
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
|
||||
rollout_state_values)
|
||||
episode_start_idx = idx + 1
|
||||
advantages = np.append(advantages, rollout_advantages)
|
||||
value_targets = np.append(value_targets, gae_based_value_targets)
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
# standardize
|
||||
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
|
||||
|
||||
for transition, advantage, value_target in zip(batch.transitions, advantages, value_targets):
|
||||
transition.info['advantage'] = advantage
|
||||
transition.info['gae_based_value_target'] = value_target
|
||||
|
||||
self.action_advantages.add_sample(advantages)
|
||||
|
||||
def train_network(self, batch, epochs):
|
||||
batch_results = []
|
||||
for j in range(epochs):
|
||||
batch.shuffle()
|
||||
batch_results = {
|
||||
'total_loss': [],
|
||||
'losses': [],
|
||||
'unclipped_grads': [],
|
||||
'kl_divergence': [],
|
||||
'entropy': []
|
||||
}
|
||||
|
||||
fetches = [self.networks['main'].online_network.output_heads[1].kl_divergence,
|
||||
self.networks['main'].online_network.output_heads[1].entropy,
|
||||
self.networks['main'].online_network.output_heads[1].likelihood_ratio,
|
||||
self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
|
||||
|
||||
for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
|
||||
start = i * self.ap.network_wrappers['main'].batch_size
|
||||
end = (i + 1) * self.ap.network_wrappers['main'].batch_size
|
||||
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
actions = batch.actions()[start:end]
|
||||
gae_based_value_targets = batch.info('gae_based_value_target')[start:end]
|
||||
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
# get old policy probabilities and distribution
|
||||
|
||||
# TODO-perf: the target network ("old_policy") does not change during these epochs, so its predictions
# could be computed once for all epochs; the shuffling should then be performed on indices only.
|
||||
result = self.networks['main'].target_network.predict({k: v[start:end] for k, v in batch.states(network_keys).items()})
|
||||
old_policy_distribution = result[1:]
|
||||
|
||||
# calculate gradients and apply on both the local policy network and on the global policy network
|
||||
if self.ap.algorithm.estimate_state_value_using_gae:
|
||||
value_targets = np.expand_dims(gae_based_value_targets, -1)
|
||||
else:
|
||||
value_targets = batch.total_returns(expand_dims=True)[start:end]
|
||||
|
||||
inputs = copy.copy({k: v[start:end] for k, v in batch.states(network_keys).items()})
|
||||
inputs['output_1_0'] = actions
|
||||
|
||||
# The old_policy_distribution needs to be represented as a list: in the case of discrete controls
# it holds just a mean, while for continuous controls it holds both a mean and a standard deviation.
|
||||
for input_index, input in enumerate(old_policy_distribution):
|
||||
inputs['output_1_{}'.format(input_index + 1)] = input
|
||||
|
||||
inputs['output_1_3'] = self.ap.algorithm.clipping_decay_schedule.current_value
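# these inputs feed the clipped-surrogate loss built inside the PPO head: the likelihood ratio
# r = pi_new(a|s) / pi_old(a|s) is clipped to [1 - eps, 1 + eps], with eps scaled by the decay value
# passed above, and the head is expected to take the pessimistic minimum of the clipped and unclipped
# ratio multiplied by the advantages supplied as targets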
|
||||
|
||||
total_loss, losses, unclipped_grads, fetch_result = \
|
||||
self.networks['main'].train_and_sync_networks(
|
||||
inputs, [value_targets, batch.info('advantage')[start:end]], additional_fetches=fetches
|
||||
)
|
||||
|
||||
batch_results['total_loss'].append(total_loss)
|
||||
batch_results['losses'].append(losses)
|
||||
batch_results['unclipped_grads'].append(unclipped_grads)
|
||||
batch_results['kl_divergence'].append(fetch_result[0])
|
||||
batch_results['entropy'].append(fetch_result[1])
|
||||
|
||||
self.unclipped_grads.add_sample(unclipped_grads)
|
||||
self.value_targets.add_sample(value_targets)
|
||||
self.likelihood_ratio.add_sample(fetch_result[2])
|
||||
self.clipped_likelihood_ratio.add_sample(fetch_result[3])
|
||||
|
||||
for key in batch_results.keys():
|
||||
batch_results[key] = np.mean(batch_results[key], 0)
|
||||
|
||||
self.value_loss.add_sample(batch_results['losses'][0])
|
||||
self.policy_loss.add_sample(batch_results['losses'][1])
|
||||
|
||||
if self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
|
||||
curr_learning_rate = self.networks['main'].online_network.get_variable_value(
|
||||
self.networks['main'].online_network.adaptive_learning_rate_scheme)
|
||||
self.curr_learning_rate.add_sample(curr_learning_rate)
|
||||
else:
|
||||
curr_learning_rate = self.ap.network_wrappers['main'].learning_rate
|
||||
|
||||
# log training parameters
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
("Surrogate loss", batch_results['losses'][1]),
|
||||
("KL divergence", batch_results['kl_divergence']),
|
||||
("Entropy", batch_results['entropy']),
|
||||
("training epoch", j),
|
||||
("learning_rate", curr_learning_rate)
|
||||
]),
|
||||
prefix="Policy training"
|
||||
)
|
||||
|
||||
self.total_kl_divergence_during_training_process = batch_results['kl_divergence']
|
||||
self.entropy.add_sample(batch_results['entropy'])
|
||||
self.kl_divergence.add_sample(batch_results['kl_divergence'])
|
||||
return batch_results['losses']
|
||||
|
||||
def post_training_commands(self):
|
||||
# clean memory
|
||||
self.call_memory('clean')
|
||||
|
||||
def train(self):
|
||||
if self._should_train(wait_for_full_episode=True):
|
||||
dataset = self.memory.transitions
|
||||
dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
|
||||
batch = Batch(dataset)
|
||||
|
||||
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
|
||||
self.networks['main'].sync()
|
||||
self.fill_advantages(batch)
|
||||
|
||||
# take only the requested number of steps
|
||||
if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
|
||||
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
|
||||
shuffle(dataset)
|
||||
batch = Batch(dataset)
|
||||
|
||||
self.train_network(batch, self.ap.algorithm.optimization_epochs)
|
||||
|
||||
self.post_training_commands()
|
||||
self.training_iteration += 1
|
||||
# self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
|
||||
return None
|
||||
|
||||
def run_pre_network_filter_for_inference(self, state: StateType):
|
||||
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
|
||||
return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
self.ap.algorithm.clipping_decay_schedule.step()
|
||||
return super().choose_action(curr_state)
415
rl_coach/agents/composite_agent.py
Normal file
@@ -0,0 +1,415 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
import itertools
|
||||
from enum import Enum
|
||||
from typing import Union, List, Dict
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.agent_interface import AgentInterface
|
||||
from rl_coach.base_parameters import AgentParameters, VisualizationParameters
|
||||
# from rl_coach.environments.environment_interface import ActionSpace
|
||||
from rl_coach.spaces import ActionSpace
|
||||
from rl_coach.spaces import AgentSelection, AttentionActionSpace, ObservationSpace, SpacesDefinition
|
||||
from rl_coach.utils import short_dynamic_import
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvResponse, ActionType, RunPhase
|
||||
from rl_coach.filters.observation.observation_crop_filter import ObservationCropFilter
|
||||
|
||||
|
||||
class DecisionPolicy(object):
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, decide on a single action to take.
|
||||
:param actions_info: a dictionary of agent names and their corresponding
|
||||
ActionInfo instances containing information for each agents action
|
||||
:return: a single action and the corresponding action info
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
|
||||
class SingleDecider(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action according to the agent that is currently in control.
|
||||
"""
|
||||
def __init__(self, default_decision_maker: str):
|
||||
super().__init__()
|
||||
self._decision_maker = default_decision_maker
|
||||
|
||||
@property
|
||||
def decision_maker(self):
|
||||
"""
|
||||
Get the decision maker that was set by the upper level control.
|
||||
"""
|
||||
return self._decision_maker
|
||||
|
||||
@decision_maker.setter
|
||||
def decision_maker(self, decision_maker: str):
|
||||
"""
|
||||
Set the decision maker by the upper level control.
|
||||
:param action: the incoming action from the upper level control.
|
||||
"""
|
||||
self._decision_maker = decision_maker
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action of the current decision maker
|
||||
:param actions_info: a list of ActionInfo instances containing the information for each agents action
|
||||
:return: a single action
|
||||
"""
|
||||
if self.decision_maker not in actions_info.keys():
|
||||
raise ValueError("The current decision maker ({}) does not exist in the given actions ({})"
|
||||
.format(self.decision_maker, actions_info.keys()))
|
||||
return actions_info[self.decision_maker]
|
||||
|
||||
|
||||
class RoundRobin(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action according to agents selected in a circular order.
|
||||
"""
|
||||
def __init__(self, num_agents: int):
|
||||
super().__init__()
|
||||
self.round_robin = itertools.cycle(range(num_agents))
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action of the current decision maker, which is set in a
|
||||
circular order
|
||||
:param actions_info: a list of ActionInfo instances containing the information for each agents action
|
||||
:return: a single action
|
||||
"""
|
||||
decision_maker = self.round_robin.__next__()
|
||||
if decision_maker not in range(len(actions_info.keys())):
|
||||
raise ValueError("The size of action_info does not match the number of agents set to RoundRobin decision"
|
||||
" policy.")
|
||||
return list(actions_info.values())[decision_maker]
|
||||
|
||||
|
||||
class MajorityVote(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that chooses the action that most of the agents chose.
|
||||
This policy is only useful for discrete control.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the action that most agents agree on
|
||||
:param actions_info: a list of ActionInfo instances containing the information for each agents action
|
||||
:return: a single action
|
||||
"""
|
||||
# TODO: enforce discrete action spaces
|
||||
if len(actions_info.keys()) == 0:
|
||||
raise ValueError("The given list of actions is empty")
|
||||
vote_count = np.bincount([action_info.action for action_info in actions_info.values()])
|
||||
majority_vote = np.argmax(vote_count)
|
||||
# return the ActionInfo of one of the agents that voted for the winning action
return next(info for info in actions_info.values() if info.action == majority_vote)
|
||||
|
||||
|
||||
class MeanDecision(DecisionPolicy):
|
||||
"""
|
||||
A decision policy that takes the mean action given the actions of all the agents.
|
||||
This policy is only useful for continuous control.
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
|
||||
"""
|
||||
Given a list of actions from multiple agents, take the mean action
|
||||
:param actions_info: a list of ActionInfo instances containing the information for each agents action
|
||||
:return: a single action
|
||||
"""
|
||||
# TODO: enforce continuous action spaces
|
||||
if len(actions_info.keys()) == 0:
|
||||
raise ValueError("The given list of actions is empty")
|
||||
mean = np.mean([action_info.action for action_info in actions_info.values()], axis=0)
|
||||
return ActionInfo(mean)
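# Example (hypothetical values, assuming numpy-array continuous actions):
#   infos = {'agent_a': ActionInfo(np.array([0.2, -1.0])), 'agent_b': ActionInfo(np.array([0.6, 0.0]))}
#   MeanDecision().choose_action(infos).action  ->  array([ 0.4, -0.5])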
|
||||
|
||||
|
||||
class RewardPolicy(Enum):
|
||||
ReachingGoal = 0
|
||||
NativeEnvironmentReward = 1
|
||||
AccumulatedEnvironmentRewards = 2
|
||||
|
||||
|
||||
class CompositeAgent(AgentInterface):
|
||||
"""
|
||||
A CompositeAgent is a group of agents in the same hierarchy level.
|
||||
In a CompositeAgent, each agent may take the role of either a controller or an observer.
|
||||
Each agent that is defined as observer, gets observations from the environment.
|
||||
Each agent that is defined as controller, can potentially also control the environment, in addition to observing it.
|
||||
There are several ways to decide on the action from different controller agents:
|
||||
1. Ensemble -
|
||||
- Take the majority vote (discrete controls)
|
||||
- Take the mean action (continuous controls)
|
||||
- Round robin between the agents (discrete/continuous)
|
||||
2. Skills -
|
||||
- At each step a single agent decides (chosen by the upper hierarchy controlling agent)
|
||||
|
||||
A CompositeAgent can be controlled using one of the following methods (ActionSpaces):
|
||||
1. Goals (in terms of measurements, observation, embedding or a change in those values)
|
||||
2. Agent Selection (skills) / Discrete action space.
|
||||
3. Attention (a subset of the real environment observation / action space)
|
||||
"""
|
||||
def __init__(self,
|
||||
agents_parameters: Union[AgentParameters, Dict[str, AgentParameters]],
|
||||
visualization_parameters: VisualizationParameters,
|
||||
decision_policy: DecisionPolicy,
|
||||
out_action_space: ActionSpace,
|
||||
in_action_space: Union[None, ActionSpace]=None,
|
||||
decision_makers: Union[bool, Dict[str, bool]]=True,
|
||||
reward_policy: RewardPolicy=RewardPolicy.NativeEnvironmentReward,
|
||||
name="CompositeAgent"):
|
||||
"""
|
||||
Construct an agent group
|
||||
:param agents_parameters: either a single AgentParameters instance or a dictionary mapping agent names to
the AgentParameters of each agent in the group
|
||||
:param decision_policy: the decision policy of the group which describes how actions are consolidated
|
||||
:param out_action_space: the type of action space that is used by this composite agent in order to control the
|
||||
underlying environment
|
||||
:param in_action_space: the type of action space that is used by the upper level agent in order to control this
|
||||
group
|
||||
:param decision_makers: a boolean, or a dictionary mapping each agent name to a boolean, stating whether
the corresponding agent has decision privileges or is just an observer
|
||||
:param reward_policy: the type of the reward that the group receives
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
if isinstance(agents_parameters, AgentParameters):
|
||||
decision_makers = {agents_parameters.name: True}
|
||||
agents_parameters = {agents_parameters.name: agents_parameters}
|
||||
self.agents_parameters = agents_parameters
|
||||
self.visualization_parameters = visualization_parameters
|
||||
self.decision_makers = decision_makers
|
||||
self.decision_policy = decision_policy
|
||||
self.in_action_space = in_action_space
|
||||
self.out_action_space = out_action_space # TODO: this is not being used
|
||||
self.reward_policy = reward_policy
|
||||
self.full_name_id = self.name = name
|
||||
self.current_decision_maker = 0
|
||||
self.environment = None
|
||||
self.agents = {} # key = agent_name, value = agent
|
||||
self.incoming_action = None
|
||||
self.last_state = None
|
||||
self._phase = RunPhase.HEATUP
|
||||
self.last_action_info = None
|
||||
self.current_episode = 0
|
||||
self.parent_level_manager = None
|
||||
|
||||
# environment spaces
|
||||
self.spaces = None
|
||||
|
||||
# counters for logging
|
||||
self.total_steps_counter = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
|
||||
# validate input
|
||||
if set(self.decision_makers) != set(self.agents_parameters):
|
||||
raise ValueError("The decision_makers dictionary keys does not match the names of the given agents")
|
||||
if sum(self.decision_makers.values()) > 1 and type(self.decision_policy) == SingleDecider \
|
||||
and type(self.in_action_space) != AgentSelection:
|
||||
raise ValueError("When the control policy is set to single decider, the master policy should control the"
|
||||
"agent group via agent selection (ControlType.AgentSelection)")
|
||||
|
||||
@property
|
||||
def parent(self):
|
||||
"""
|
||||
Get the parent of the composite agent
:return: the parent object
|
||||
"""
|
||||
return self._parent
|
||||
|
||||
@parent.setter
|
||||
def parent(self, val):
|
||||
"""
|
||||
Change the parent class of the composite agent.
|
||||
Additionally, updates the full name of the agent
|
||||
:param val: the new parent
|
||||
:return: None
|
||||
"""
|
||||
self._parent = val
|
||||
if not hasattr(self._parent, 'name'):
|
||||
raise ValueError("The parent of a composite agent must have a name")
|
||||
self.full_name_id = "{}/{}".format(self._parent.name, self.name)
|
||||
|
||||
def create_agents(self):
|
||||
for agent_name, agent_parameters in self.agents_parameters.items():
|
||||
agent_parameters.name = agent_name
|
||||
|
||||
# create agent
|
||||
self.agents[agent_parameters.name] = short_dynamic_import(agent_parameters.path)(agent_parameters,
|
||||
parent=self)
|
||||
self.agents[agent_parameters.name].parent_level_manager = self.parent_level_manager
|
||||
|
||||
# TODO: this is a bit too specific to be defined here
|
||||
# add an attention cropping filter if the incoming directives are attention boxes
|
||||
if isinstance(self.in_action_space, AttentionActionSpace):
|
||||
attention_size = self.in_action_space.forced_attention_size
|
||||
for agent in self.agents.values():
|
||||
agent.input_filter.observation_filters['attention'] = \
|
||||
ObservationCropFilter(crop_low=np.zeros_like(attention_size), crop_high=attention_size)
|
||||
agent.input_filter.observation_filters.move_to_end('attention', last=False) # add the cropping at the beginning
|
||||
|
||||
def setup_logger(self) -> None:
|
||||
"""
|
||||
Setup the logger for all the agents in the composite agent
|
||||
:return: None
|
||||
"""
|
||||
[agent.setup_logger() for agent in self.agents.values()]
|
||||
|
||||
def set_session(self, sess) -> None:
|
||||
"""
|
||||
Set the deep learning framework session for all the agents in the composite agent
|
||||
:return: None
|
||||
"""
|
||||
[agent.set_session(sess) for agent in self.agents.values()]
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
"""
|
||||
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
|
||||
dependent on those values, by calling init_environment_dependent_modules
|
||||
:param spaces: the definitions of all the spaces of the environment
|
||||
:return: None
|
||||
"""
|
||||
self.spaces = copy.deepcopy(spaces)
|
||||
[agent.set_environment_parameters(self.spaces) for agent in self.agents.values()]
|
||||
|
||||
@property
|
||||
def phase(self):
|
||||
return self._phase
|
||||
|
||||
@phase.setter
|
||||
def phase(self, val: RunPhase) -> None:
|
||||
"""
|
||||
Change the current phase of all the agents in the group
|
||||
:param val: the new phase
|
||||
:return: None
|
||||
"""
|
||||
self._phase = val
|
||||
for agent in self.agents.values():
|
||||
agent.phase = val
|
||||
|
||||
def end_episode(self) -> None:
|
||||
"""
|
||||
End an episode
|
||||
:return: None
|
||||
"""
|
||||
self.current_episode += 1
|
||||
[agent.handle_episode_ended() for agent in self.agents.values()]
|
||||
|
||||
def reset_internal_state(self) -> None:
|
||||
"""
|
||||
Reset the episode for all the agents in the group
|
||||
:return: None
|
||||
"""
|
||||
# update counters
|
||||
self.total_steps_counter = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.total_reward_in_current_episode = 0
|
||||
|
||||
# reset all sub modules
|
||||
[agent.reset_internal_state() for agent in self.agents.values()]
|
||||
|
||||
def train(self) -> Union[float, List]:
|
||||
"""
|
||||
Make a single training step for all the agents of the group
|
||||
:return: a list of loss values from the training step
|
||||
"""
|
||||
return [agent.train() for agent in self.agents.values()]
|
||||
|
||||
def act(self) -> ActionInfo:
|
||||
"""
|
||||
Get the actions from all the agents in the group. Then use the decision policy in order to
|
||||
extract a single action out of the list of actions.
|
||||
:return: the chosen action and its corresponding information
|
||||
"""
|
||||
|
||||
# update counters
|
||||
self.total_steps_counter += 1
|
||||
self.current_episode_steps_counter += 1
|
||||
|
||||
# get the actions info from all the agents
|
||||
actions_info = {}
|
||||
for agent_name, agent in self.agents.items():
|
||||
action_info = agent.act()
|
||||
actions_info[agent_name] = action_info
|
||||
|
||||
# decide on a single action to apply to the environment
|
||||
action_info = self.decision_policy.choose_action(actions_info)
|
||||
|
||||
# TODO: make the last action info a property?
|
||||
# pass the action info to all the observers
|
||||
for agent_name, is_decision_maker in self.decision_makers.items():
|
||||
if not is_decision_maker:
|
||||
self.agents[agent_name].last_action_info = action_info
|
||||
self.last_action_info = action_info
|
||||
|
||||
return self.last_action_info
|
||||
|
||||
def observe(self, env_response: EnvResponse) -> bool:
|
||||
"""
|
||||
Given a response from the environment as a env_response, filter it and pass it to the agents.
|
||||
This method has two main jobs:
|
||||
1. Wrap the previous transition, ending with the new observation coming from EnvResponse.
|
||||
2. Save the next_state as the current_state to take action upon for the next call to act().
|
||||
|
||||
:param env_response: the response received from the environment after the last action
:return: True if the episode should end (game over or a goal was reached), False otherwise
"""
|
||||
|
||||
# accumulate the unfiltered rewards for visualization
|
||||
self.total_reward_in_current_episode += env_response.reward
|
||||
|
||||
episode_ended = env_response.game_over
|
||||
|
||||
# pass the env_response to all the sub-agents
|
||||
# TODO: what if one agent decides to end the episode but the others don't? who decides?
|
||||
for agent_name, agent in self.agents.items():
|
||||
goal_reached = agent.observe(env_response)
|
||||
episode_ended = episode_ended or goal_reached
|
||||
|
||||
# TODO: unlike for a single agent, here we also treat a game over by the environment.
|
||||
# probably better to only return the agents' goal_reached decisions.
|
||||
return episode_ended
|
||||
|
||||
def save_checkpoint(self, checkpoint_id: int) -> None:
|
||||
[agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]
|
||||
|
||||
def set_incoming_directive(self, action: ActionType) -> None:
|
||||
self.incoming_action = action
|
||||
if isinstance(self.decision_policy, SingleDecider) and isinstance(self.in_action_space, AgentSelection):
|
||||
self.decision_policy.decision_maker = list(self.agents.keys())[action]
|
||||
if isinstance(self.in_action_space, AttentionActionSpace):
|
||||
# TODO: redesign to be more modular
|
||||
for agent in self.agents.values():
|
||||
agent.input_filter.observation_filters['attention'].crop_low = action[0]
|
||||
agent.input_filter.observation_filters['attention'].crop_high = action[1]
|
||||
agent.output_filter.action_filters['masking'].set_masking(action[0], action[1])
|
||||
|
||||
# TODO rethink this scheme. we don't want so many if else clauses lying around here.
|
||||
# TODO - for incoming actions which do not involve setting the acting agent we should change the
|
||||
# observation_space, goal to pursue, etc accordingly to the incoming action.
|
||||
|
||||
def sync(self) -> None:
|
||||
"""
|
||||
Sync the agent networks with the global network
|
||||
:return:
|
||||
"""
|
||||
[agent.sync() for agent in self.agents.values()]
192
rl_coach/agents/ddpg_agent.py
Normal file
@@ -0,0 +1,192 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
|
||||
AgentParameters, InputEmbedderParameters, EmbedderScheme
|
||||
from rl_coach.exploration_policies.ou_process import OUProcessParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.spaces import BoxActionSpace, GoalsSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.ddpg_actor_head import DDPGActorHeadParameters
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps
|
||||
|
||||
|
||||
class DDPGCriticNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True),
|
||||
'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [VHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 64
|
||||
self.async_training = False
|
||||
self.learning_rate = 0.001
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
|
||||
|
||||
|
||||
class DDPGActorNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True)}
|
||||
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
|
||||
self.heads_parameters = [DDPGActorHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 64
|
||||
self.async_training = False
|
||||
self.learning_rate = 0.0001
|
||||
self.create_target_network = True
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
|
||||
|
||||
|
||||
class DDPGAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
|
||||
self.rate_for_copying_weights_to_target = 0.001
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(1)
|
||||
self.use_target_network_for_evaluation = False
|
||||
self.action_penalty = 0
|
||||
self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
|
||||
self.use_non_zero_discount_for_terminal_states = False
|
||||
|
||||
|
||||
class DDPGAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DDPGAlgorithmParameters(),
|
||||
exploration=OUProcessParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"actor": DDPGActorNetworkParameters(),
|
||||
"critic": DDPGCriticNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.ddpg_agent:DDPGAgent'
|
||||
|
||||
|
||||
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
|
||||
class DDPGAgent(ActorCriticAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.q_values = self.register_signal("Q")
|
||||
self.TD_targets_signal = self.register_signal("TD targets")
|
||||
self.action_signal = self.register_signal("actions")
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
actor = self.networks['actor']
|
||||
critic = self.networks['critic']
|
||||
|
||||
actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
|
||||
critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
|
||||
|
||||
# TD target: y = r + discount * Q_target(s_t+1, mu_target(s_t+1)); the target actor supplies the next action,
# so there is no max over actions
|
||||
next_actions, actions_mean = actor.parallel_prediction([
|
||||
(actor.target_network, batch.next_states(actor_keys)),
|
||||
(actor.online_network, batch.states(actor_keys))
|
||||
])
|
||||
|
||||
critic_inputs = copy.copy(batch.next_states(critic_keys))
|
||||
critic_inputs['action'] = next_actions
|
||||
q_st_plus_1 = critic.target_network.predict(critic_inputs)
|
||||
|
||||
# calculate the bootstrapped TD targets while discounting terminal states according to
|
||||
# use_non_zero_discount_for_terminal_states
|
||||
if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
|
||||
TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
|
||||
else:
|
||||
TD_targets = batch.rewards(expand_dims=True) + \
|
||||
(1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1
|
||||
|
||||
# clip the TD targets to prevent overestimation errors
|
||||
if self.ap.algorithm.clip_critic_targets:
|
||||
TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)
|
||||
|
||||
self.TD_targets_signal.add_sample(TD_targets)
|
||||
|
||||
# get the gradients of the critic output with respect to the action
|
||||
critic_inputs = copy.copy(batch.states(critic_keys))
|
||||
critic_inputs['action'] = actions_mean
|
||||
action_gradients = critic.online_network.predict(critic_inputs,
|
||||
outputs=critic.online_network.gradients_wrt_inputs[0]['action'])
|
||||
|
||||
# train the critic
|
||||
critic_inputs = copy.copy(batch.states(critic_keys))
|
||||
critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
|
||||
result = critic.train_and_sync_networks(critic_inputs, TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
# apply the gradients from the critic to the actor
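# deterministic policy gradient: grad_theta J ~= E[ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s) ];
# the critic's action gradients are fed in negated because the optimizer minimizes, i.e. ascends on Q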
|
||||
initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
|
||||
gradients = actor.online_network.predict(batch.states(actor_keys),
|
||||
outputs=actor.online_network.weighted_gradients[0],
|
||||
initial_feed_dict=initial_feed_dict)
|
||||
|
||||
if actor.has_global:
|
||||
actor.apply_gradients_to_global_network(gradients)
|
||||
actor.update_online_network()
|
||||
else:
|
||||
actor.apply_gradients_to_online_network(gradients)
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def train(self):
|
||||
return Agent.train(self)
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if not (isinstance(self.spaces.action, BoxActionSpace) or isinstance(self.spaces.action, GoalsSpace)):
|
||||
raise ValueError("DDPG works only for continuous control problems")
|
||||
# convert to batch so we can run it through the network
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
|
||||
if self.ap.algorithm.use_target_network_for_evaluation:
|
||||
actor_network = self.networks['actor'].target_network
|
||||
else:
|
||||
actor_network = self.networks['actor'].online_network
|
||||
|
||||
action_values = actor_network.predict(tf_input_state).squeeze()
|
||||
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
self.action_signal.add_sample(action)
|
||||
|
||||
# get q value
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'critic')
|
||||
action_batch = np.expand_dims(action, 0)
|
||||
if type(action) != np.ndarray:
|
||||
action_batch = np.array([[action]])
|
||||
tf_input_state['action'] = action_batch
|
||||
q_value = self.networks['critic'].online_network.predict(tf_input_state)[0]
|
||||
self.q_values.add_sample(q_value)
|
||||
|
||||
action_info = ActionInfo(action=action,
|
||||
action_value=q_value)
|
||||
|
||||
return action_info
69
rl_coach/agents/ddqn_agent.py
Normal file
@@ -0,0 +1,69 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
|
||||
|
||||
class DDQNAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
|
||||
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
|
||||
self.exploration.evaluation_epsilon = 0.001
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.ddqn_agent:DDQNAgent'
|
||||
|
||||
|
||||
# Double DQN - https://arxiv.org/abs/1509.06461
|
||||
class DDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# initialize with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
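# Double DQN target: the online network picks a* = argmax_a Q_online(s_t+1, a) (selected_actions above),
# while the target network supplies its value, i.e. y = r + (1 - done) * discount * Q_target(s_t+1, a*)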
|
||||
TD_errors = []
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
new_target = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
|
||||
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
|
||||
TD_targets[i, batch.actions()[i]] = new_target
|
||||
|
||||
# update errors in prioritized replay buffer
|
||||
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
|
||||
importance_weights=importance_weights)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
219
rl_coach/agents/dfp_agent.py
Normal file
@@ -0,0 +1,219 @@
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from enum import Enum
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
|
||||
from rl_coach.architectures.tensorflow_components.heads.measurements_prediction_head import MeasurementsPredictionHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
|
||||
InputEmbedderParameters, MiddlewareScheme
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class HandlingTargetsAfterEpisodeEnd(Enum):
|
||||
LastStep = 0
|
||||
NAN = 1
|
||||
|
||||
|
||||
class DFPNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
|
||||
'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
|
||||
'goal': InputEmbedderParameters(activation_function='leaky_relu')}
|
||||
|
||||
self.input_embedders_parameters['observation'].scheme = [
|
||||
Conv2d([32, 8, 4]),
|
||||
Conv2d([64, 4, 2]),
|
||||
Conv2d([64, 3, 1]),
|
||||
Dense([512]),
|
||||
]
|
||||
|
||||
self.input_embedders_parameters['measurements'].scheme = [
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
]
|
||||
|
||||
self.input_embedders_parameters['goal'].scheme = [
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
]
|
||||
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
|
||||
scheme=MiddlewareScheme.Empty)
|
||||
self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = False
|
||||
self.batch_size = 64
|
||||
self.adam_optimizer_beta1 = 0.95
|
||||
|
||||
|
||||
class DFPMemoryParameters(EpisodicExperienceReplayParameters):
|
||||
def __init__(self):
|
||||
self.max_size = (MemoryGranularity.Transitions, 20000)
|
||||
self.shared_memory = True
|
||||
super().__init__()
|
||||
|
||||
|
||||
class DFPAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_predicted_steps_ahead = 6
|
||||
self.goal_vector = [1.0, 1.0]
|
||||
self.future_measurements_weights = [0.5, 0.5, 1.0]
|
||||
self.use_accumulated_reward_as_measurement = False
|
||||
self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
|
||||
self.scale_measurements_targets = {}
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(8)
|
||||
|
||||
|
||||
class DFPAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DFPAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=DFPMemoryParameters(),
|
||||
networks={"main": DFPNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.dfp_agent:DFPAgent'
|
||||
|
||||
|
||||
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
|
||||
class DFPAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.current_goal = self.ap.algorithm.goal_vector
|
||||
self.target_measurements_scale_factors = None
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
network_inputs = batch.states(network_keys)
|
||||
network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
|
||||
self.ap.network_wrappers['main'].batch_size, axis=0)
|
||||
|
||||
# get the current outputs of the network
|
||||
targets = self.networks['main'].online_network.predict(network_inputs)
|
||||
|
||||
# change the targets for the taken actions
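# each transition stores its future measurement targets (filled in by _update_measurements_targets) in its
# info dict; they become the regression target only for the action that was actually taken, so every other
# action keeps the network's own prediction and contributes zero error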
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
# predict the future measurements
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
|
||||
tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
|
||||
measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
|
||||
action_values = np.zeros(len(self.spaces.action.actions))
|
||||
num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)
|
||||
|
||||
# calculate the score of each action by multiplying its predicted future measurements with the goal vector
|
||||
for action_idx in range(len(self.spaces.action.actions)):
|
||||
action_measurements = measurements_future_prediction[action_idx]
|
||||
action_measurements = np.reshape(action_measurements,
|
||||
(self.ap.algorithm.num_predicted_steps_ahead,
|
||||
self.spaces.state['measurements'].shape[0]))
|
||||
future_steps_values = np.dot(action_measurements, self.current_goal)
|
||||
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
|
||||
self.ap.algorithm.future_measurements_weights)
|
||||
else:
|
||||
action_values = None
|
||||
|
||||
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
if action_values is not None:
|
||||
action_values = action_values.squeeze()
|
||||
action_info = ActionInfo(action=action, action_value=action_values[action])
|
||||
else:
|
||||
action_info = ActionInfo(action=action)
|
||||
|
||||
return action_info
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
self.spaces = copy.deepcopy(spaces)
|
||||
self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
|
||||
measurements_names=
|
||||
self.spaces.state['measurements'].measurements_names)
|
||||
|
||||
# if the user has filled in some scale values, check that the measurement names are correct
|
||||
if set(self.spaces.state['measurements'].measurements_names).intersection(
|
||||
self.ap.algorithm.scale_measurements_targets.keys()) !=\
|
||||
set(self.ap.algorithm.scale_measurements_targets.keys()):
|
||||
raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
|
||||
"the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
|
||||
self.spaces.state['measurements'].measurements_names))
|
||||
|
||||
super().set_environment_parameters(self.spaces)
|
||||
|
||||
# the below is done after calling the base class method, as it might add accumulated reward as a measurement
|
||||
|
||||
# fill out the missing measurements scale factors
|
||||
for measurement_name in self.spaces.state['measurements'].measurements_names:
|
||||
if measurement_name not in self.ap.algorithm.scale_measurements_targets:
|
||||
self.ap.algorithm.scale_measurements_targets[measurement_name] = 1
|
||||
|
||||
self.target_measurements_scale_factors = \
|
||||
np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
|
||||
self.spaces.state['measurements'].measurements_names])
|
||||
|
||||
def handle_episode_ended(self):
|
||||
last_episode = self.current_episode_buffer
|
||||
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
|
||||
self._update_measurements_targets(last_episode,
|
||||
self.ap.algorithm.num_predicted_steps_ahead)
|
||||
super().handle_episode_ended()
|
||||
|
||||
def _update_measurements_targets(self, episode, num_steps):
|
||||
if 'measurements' not in episode.transitions[0].state or len(episode.transitions[0].state['measurements']) == 0:
|
||||
raise ValueError("Measurements are not present in the transitions of the last episode played. ")
|
||||
measurements_size = self.spaces.state['measurements'].shape[0]
|
||||
for transition_idx, transition in enumerate(episode.transitions):
|
||||
transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
|
||||
for step in range(num_steps):
|
||||
offset_idx = transition_idx + 2 ** step
|
||||
|
||||
if offset_idx >= episode.length():
|
||||
if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
|
||||
# the special MSE loss will ignore those entries so that the gradient will be 0 for these
|
||||
transition.info['future_measurements'][step] = np.nan
|
||||
continue
|
||||
|
||||
elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
|
||||
offset_idx = -1
|
||||
|
||||
transition.info['future_measurements'][step] = \
|
||||
self.target_measurements_scale_factors * \
|
||||
(episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])
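A self-contained sketch (illustrative only, not part of dfp_agent.py) of how the future-measurement targets above are laid out at exponentially spaced offsets, with NaN marking offsets that fall past the end of the episode; the toy measurements array stands in for the per-step measurement vectors:

import numpy as np

measurements = np.arange(20, dtype=float).reshape(10, 2)  # toy episode: 10 steps, 2 measurements per step
num_steps = 4                                             # predict 1, 2, 4 and 8 steps ahead
episode_length, measurements_size = measurements.shape

targets = np.full((episode_length, num_steps, measurements_size), np.nan)
for t in range(episode_length):
    for step in range(num_steps):
        offset = t + 2 ** step
        if offset < episode_length:
            # difference between the future measurements and the current ones
            targets[t, step] = measurements[offset] - measurements[t]
        # otherwise the entry stays NaN so that a masked loss can ignore it

print(targets[0])  # offsets 1, 2, 4 and 8 from the first step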
|
||||
99
rl_coach/agents/dqn_agent.py
Normal file
@@ -0,0 +1,99 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
|
||||
InputEmbedderParameters, MiddlewareScheme
|
||||
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
|
||||
class DQNAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
self.discount = 0.99
|
||||
|
||||
|
||||
class DQNNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
|
||||
self.heads_parameters = [QHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class DQNAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=DQNAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=ExperienceReplayParameters(),
|
||||
networks={"main": DQNNetworkParameters()})
|
||||
self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
|
||||
self.exploration.evaluation_epsilon = 0.05
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.dqn_agent:DQNAgent'
|
||||
|
||||
|
||||
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
|
||||
class DQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# for the action we actually took, the error is:
# TD error = r + discount * max(q_st_plus_1) - q_st
# for all other actions, the error is 0
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
TD_errors = []
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
new_target = batch.rewards()[i] +\
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
|
||||
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
|
||||
TD_targets[i, batch.actions()[i]] = new_target
|
||||
|
||||
# update errors in prioritized replay buffer
|
||||
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
|
||||
importance_weights=importance_weights)
|
||||
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
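A rough, stand-alone illustration of the 1-step target computed above, target = r + discount * (1 - done) * max_a Q_target(s', a), applied only to the action that was actually taken; all arrays below are made-up stand-ins for a sampled batch:

import numpy as np

rewards    = np.array([1.0, 0.0, -1.0])
game_overs = np.array([0.0, 0.0, 1.0])                        # 1.0 marks terminal transitions
actions    = np.array([0, 1, 1])
discount   = 0.99
q_next     = np.array([[0.5, 2.0], [1.0, 0.1], [3.0, 4.0]])   # target network on s'
td_targets = np.array([[0.2, 0.3], [0.4, 0.5], [0.6, 0.7]])   # online network on s

new_targets = rewards + (1.0 - game_overs) * discount * q_next.max(axis=1)
td_errors = np.abs(new_targets - td_targets[np.arange(3), actions])
td_targets[np.arange(3), actions] = new_targets               # only the taken action is updated

print(td_targets)
print(td_errors)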
|
||||
108
rl_coach/agents/hac_ddpg_agent.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import copy
|
||||
|
||||
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
|
||||
from rl_coach.core_types import RunPhase
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.time_limit = 40
|
||||
self.sub_goal_testing_rate = 0.5
|
||||
|
||||
|
||||
class HACDDPGAgentParameters(DDPGAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = HACDDPGAlgorithmParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.hac_ddpg_agent:HACDDPGAgent'
|
||||
|
||||
|
||||
# Hierarchical Actor Critic Generating Subgoals DDPG Agent - https://arxiv.org/pdf/1712.00948.pdf
|
||||
class HACDDPGAgent(DDPGAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
|
||||
self.graph_manager = None
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
# the top level decides, for each of its generated sub-goals, whether all the layers beneath it are in a
# sub-goal testing phase
|
||||
|
||||
graph_manager = self.parent_level_manager.parent_graph_manager
|
||||
if self.ap.is_a_highest_level_agent:
|
||||
graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate
|
||||
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
if graph_manager.should_test_current_sub_goal:
|
||||
self.exploration_policy.change_phase(RunPhase.TEST)
|
||||
else:
|
||||
self.exploration_policy.change_phase(self.phase)
|
||||
|
||||
action_info = super().choose_action(curr_state)
|
||||
return action_info
|
||||
|
||||
def update_transition_before_adding_to_replay_buffer(self, transition):
|
||||
graph_manager = self.parent_level_manager.parent_graph_manager
|
||||
|
||||
# deal with goals given from a higher level agent
|
||||
if not self.ap.is_a_highest_level_agent:
|
||||
transition.state['desired_goal'] = self.current_hrl_goal
|
||||
transition.next_state['desired_goal'] = self.current_hrl_goal
|
||||
# TODO: allow setting goals which are not part of the state. e.g. state-embedding using get_prediction
|
||||
self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
|
||||
self.current_hrl_goal, transition.next_state))
|
||||
goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
|
||||
self.current_hrl_goal, transition.next_state)
|
||||
transition.reward = goal_reward
|
||||
transition.game_over = transition.game_over or sub_goal_reached
|
||||
|
||||
# each level tests its own generated sub goals
|
||||
if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
|
||||
#TODO-fixme
|
||||
# _, sub_goal_reached = self.parent_level_manager.environment.agents['agent_1'].spaces.goal.\
|
||||
# get_reward_for_goal_and_state(transition.action, transition.next_state)
|
||||
|
||||
_, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
|
||||
transition.action, transition.next_state)
|
||||
|
||||
sub_goal_is_missed = not sub_goal_reached
|
||||
|
||||
if sub_goal_is_missed:
|
||||
transition.reward = -self.ap.algorithm.time_limit
|
||||
return transition
|
||||
|
||||
def set_environment_parameters(self, spaces: SpacesDefinition):
|
||||
super().set_environment_parameters(spaces)
|
||||
|
||||
if self.ap.is_a_highest_level_agent:
|
||||
# the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
|
||||
# their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
|
||||
self.spaces.goal = self.spaces.action
|
||||
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
|
||||
|
||||
if not self.ap.is_a_highest_level_agent:
|
||||
self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward
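An illustrative sketch (with assumed values, not taken from the file above) of the sub-goal testing logic: the top level occasionally tests a sub-goal, and if the lower level fails to reach it the transition is penalized with -time_limit:

import numpy as np

rng = np.random.RandomState(0)
sub_goal_testing_rate = 0.5
time_limit = 40

should_test_current_sub_goal = rng.rand() < sub_goal_testing_rate  # decided by the top level agent
sub_goal_reached = False                                           # e.g. the goal space distance check failed

reward = 0.0
if should_test_current_sub_goal and not sub_goal_reached:
    reward = -float(time_limit)   # penalize the level that generated an unreachable sub-goal

print(reward)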
|
||||
115
rl_coach/agents/human_agent.py
Normal file
@@ -0,0 +1,115 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
from collections import OrderedDict
|
||||
from typing import Union
|
||||
|
||||
import pygame
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.agents.bc_agent import BCNetworkParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, InputEmbedderParameters, EmbedderScheme, \
|
||||
AgentParameters
|
||||
from rl_coach.core_types import ActionInfo
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from pandas import to_pickle
|
||||
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class HumanAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
|
||||
class HumanNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [PolicyHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
|
||||
|
||||
class HumanAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=HumanAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": BCNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.human_agent:HumanAgent'
|
||||
|
||||
|
||||
class HumanAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.clock = pygame.time.Clock()
|
||||
self.max_fps = int(self.ap.visualization.max_fps_for_human_control)
|
||||
self.env = None
|
||||
|
||||
def init_environment_dependent_modules(self):
|
||||
super().init_environment_dependent_modules()
|
||||
self.env = self.parent_level_manager._real_environment
|
||||
screen.log_title("Human Control Mode")
|
||||
available_keys = self.env.get_available_keys()
|
||||
if available_keys:
|
||||
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
|
||||
screen.log("")
|
||||
for action, key in self.env.get_available_keys():
|
||||
screen.log("\t- {}: {}".format(action, key))
|
||||
screen.separator()
|
||||
|
||||
def train(self):
|
||||
return 0
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
action = ActionInfo(self.env.get_action_from_user(), action_value=0)
|
||||
action = self.output_filter.reverse_filter(action)
|
||||
|
||||
# keep constant fps
|
||||
self.clock.tick(self.max_fps)
|
||||
|
||||
if not self.env.renderer.is_open:
|
||||
self.save_replay_buffer_and_exit()
|
||||
|
||||
return action
|
||||
|
||||
def save_replay_buffer_and_exit(self):
|
||||
replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
|
||||
self.memory.tp = None
|
||||
to_pickle(self.memory, replay_buffer_path)
|
||||
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
|
||||
exit()
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Episode"] = self.current_episode
|
||||
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
screen.log_dict(log, prefix="Recording")
|
||||
76
rl_coach/agents/imitation_agent.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from collections import OrderedDict
|
||||
from typing import Union
|
||||
|
||||
from rl_coach.core_types import RunPhase, ActionInfo
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
## This is an abstract agent - there is no learn_from_batch method ##
|
||||
|
||||
# Imitation Agent
|
||||
class ImitationAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.imitation = True
|
||||
|
||||
def extract_action_values(self, prediction):
|
||||
return prediction.squeeze()
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
# convert to batch so we can run it through the network
|
||||
prediction = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(curr_state, 'main'))
|
||||
|
||||
# get action values and extract the best action from it
|
||||
action_values = self.extract_action_values(prediction)
|
||||
if type(self.spaces.action) == DiscreteActionSpace:
|
||||
# DISCRETE
|
||||
self.exploration_policy.phase = RunPhase.TEST
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
action_info = ActionInfo(action=action,
|
||||
action_probability=action_values[action])
|
||||
else:
|
||||
# CONTINUOUS
|
||||
action = action_values
|
||||
|
||||
action_info = ActionInfo(action=action)
|
||||
|
||||
return action_info
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
if self.phase == RunPhase.TRAIN:
|
||||
# for the training phase - we log during the episode to visualize the progress in training
|
||||
log = OrderedDict()
|
||||
if self.task_id is not None:
|
||||
log["Worker"] = self.task_id
|
||||
log["Episode"] = self.current_episode
|
||||
log["Loss"] = self.loss.values[-1]
|
||||
log["Training iteration"] = self.training_iteration
|
||||
screen.log_dict(log, prefix="Training")
|
||||
else:
|
||||
# for the evaluation phase - logging as in regular RL
|
||||
super().log_to_screen()
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
raise NotImplementedError("ImitationAgent is an abstract agent. Not to be used directly.")
|
||||
72
rl_coach/agents/mmc_agent.py
Normal file
@@ -0,0 +1,72 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
|
||||
|
||||
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.monte_carlo_mixing_rate = 0.1
|
||||
|
||||
|
||||
class MixedMonteCarloAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = MixedMonteCarloAlgorithmParameters()
|
||||
self.memory = EpisodicExperienceReplayParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.mmc_agent:MixedMonteCarloAgent'
|
||||
|
||||
|
||||
class MixedMonteCarloAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# for the 1-step target we use the double-DQN formulation, hence actions are selected greedily according to the online network
|
||||
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
|
||||
|
||||
# TD_targets are initialized with the current prediction so that we will
|
||||
# only update the action that we have actually done in this transition
|
||||
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
one_step_target = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
|
||||
q_st_plus_1[i][selected_actions[i]]
|
||||
monte_carlo_target = batch.total_returns()[i]
|
||||
TD_targets[i, batch.actions()[i]] = (1 - self.mixing_rate) * one_step_target + \
|
||||
self.mixing_rate * monte_carlo_target
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
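A one-shot numeric sketch of the mixed target above, combining the 1-step (double-DQN style) target with the observed Monte Carlo return; the values are arbitrary:

mixing_rate = 0.1
one_step_target = 2.5      # r + discount * Q_target(s', argmax_a Q_online(s', a))
monte_carlo_target = 4.0   # total discounted return actually observed from s

mixed_target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_target
print(mixed_target)        # 2.65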
|
||||
126
rl_coach/agents/n_step_q_agent.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
|
||||
InputEmbedderParameters
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.utils import last_sample
|
||||
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
|
||||
|
||||
|
||||
class NStepQNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [QHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.async_training = True
|
||||
self.shared_optimizer = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class NStepQAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
|
||||
self.apply_gradients_every_x_episodes = 1
|
||||
self.num_steps_between_gradient_updates = 5 # this is called t_max in all the papers
|
||||
self.targets_horizon = 'N-Step'
|
||||
|
||||
|
||||
class NStepQAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NStepQAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=SingleEpisodeBufferParameters(),
|
||||
networks={"main": NStepQNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.n_step_q_agent:NStepQAgent'
|
||||
|
||||
|
||||
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
|
||||
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.last_gradient_update_step_idx = 0
|
||||
self.q_values = self.register_signal('Q Values')
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
# batch contains a list of episodes to learn from
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# get the values for the current states
|
||||
state_value_head_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
|
||||
|
||||
# the targets for the state value estimator
|
||||
if self.ap.algorithm.targets_horizon == '1-Step':
|
||||
# 1-Step Q learning
|
||||
q_st_plus_1 = self.networks['main'].target_network.predict(batch.next_states(network_keys))
|
||||
|
||||
for i in reversed(range(batch.size)):
|
||||
state_value_head_targets[i][batch.actions()[i]] = \
|
||||
batch.rewards()[i] \
|
||||
+ (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
|
||||
|
||||
elif self.ap.algorithm.targets_horizon == 'N-Step':
|
||||
# N-Step Q learning
|
||||
if batch.game_overs()[-1]:
|
||||
R = 0
|
||||
else:
|
||||
R = np.max(self.networks['main'].target_network.predict(last_sample(batch.next_states(network_keys))))
|
||||
|
||||
for i in reversed(range(batch.size)):
|
||||
R = batch.rewards()[i] + self.ap.algorithm.discount * R
|
||||
state_value_head_targets[i][batch.actions()[i]] = R
|
||||
|
||||
else:
|
||||
raise ValueError('The available values for targets_horizon are: 1-Step, N-Step')
|
||||
|
||||
# train
|
||||
result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys), [state_value_head_targets])
|
||||
|
||||
# logging
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
self.value_loss.add_sample(losses[0])
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def train(self):
|
||||
# update the target network of every network that has a target network
|
||||
if any([network.has_target for network in self.networks.values()]) \
|
||||
and self._should_update_online_weights_to_target():
|
||||
for network in self.networks.values():
|
||||
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
|
||||
|
||||
self.agent_logger.create_signal_value('Update Target Network', 1)
|
||||
else:
|
||||
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
|
||||
return PolicyOptimizationAgent.train(self)
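A stand-alone sketch of the N-step backup used in learn_from_batch above: the return is bootstrapped from the target network at the last state (unless it is terminal) and rolled backwards through the collected rewards; the numbers are illustrative:

import numpy as np

rewards = np.array([1.0, 0.0, 0.0, 1.0])   # the t_max collected transitions
discount = 0.99
last_state_is_terminal = False
bootstrap = 0.0 if last_state_is_terminal else 5.0   # stands in for max_a Q_target(s_last, a)

R = bootstrap
returns = np.zeros_like(rewards)
for i in reversed(range(len(rewards))):
    R = rewards[i] + discount * R           # R_i = r_i + discount * R_{i+1}
    returns[i] = R

print(returns)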
|
||||
126
rl_coach/agents/naf_agent.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.naf_head import NAFHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, \
|
||||
NetworkParameters, InputEmbedderParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.spaces import BoxActionSpace
|
||||
|
||||
from rl_coach.core_types import ActionInfo, EnvironmentSteps
|
||||
from rl_coach.exploration_policies.ou_process import OUProcessParameters
|
||||
|
||||
|
||||
class NAFNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [NAFHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.learning_rate = 0.001
|
||||
self.async_training = True
|
||||
self.create_target_network = True
|
||||
|
||||
|
||||
class NAFAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.num_consecutive_training_steps = 5
|
||||
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
|
||||
self.rate_for_copying_weights_to_target = 0.001
|
||||
|
||||
|
||||
class NAFAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NAFAlgorithmParameters(),
|
||||
exploration=OUProcessParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"main": NAFNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.naf_agent:NAFAgent'
|
||||
|
||||
|
||||
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
|
||||
class NAFAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.l_values = self.register_signal("L")
|
||||
self.a_values = self.register_signal("Advantage")
|
||||
self.mu_values = self.register_signal("Action")
|
||||
self.v_values = self.register_signal("V")
|
||||
self.TD_targets = self.register_signal("TD targets")
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# TD error = r + discount*v_st_plus_1 - q_st
|
||||
v_st_plus_1 = self.networks['main'].target_network.predict(
|
||||
batch.next_states(network_keys),
|
||||
self.networks['main'].target_network.output_heads[0].V,
|
||||
squeeze_output=False,
|
||||
)
|
||||
TD_targets = np.expand_dims(batch.rewards(), -1) + \
|
||||
(1.0 - np.expand_dims(batch.game_overs(), -1)) * self.ap.algorithm.discount * v_st_plus_1
|
||||
|
||||
self.TD_targets.add_sample(TD_targets)
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
|
||||
'output_0_0': batch.actions(len(batch.actions().shape) == 1)
|
||||
}, TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
if type(self.spaces.action) != BoxActionSpace:
|
||||
raise ValueError('NAF works only for continuous control problems')
|
||||
|
||||
# convert to batch so we can run it through the network
|
||||
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
|
||||
naf_head = self.networks['main'].online_network.output_heads[0]
|
||||
action_values = self.networks['main'].online_network.predict(tf_input_state, outputs=naf_head.mu,
|
||||
squeeze_output=False)
|
||||
|
||||
# get the actual action to use
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
# get the internal values for logging
|
||||
outputs = [naf_head.mu, naf_head.Q, naf_head.L, naf_head.A, naf_head.V]
|
||||
result = self.networks['main'].online_network.predict(
|
||||
{**tf_input_state, 'output_0_0': action_values},
|
||||
outputs=outputs
|
||||
)
|
||||
mu, Q, L, A, V = result
|
||||
|
||||
# store the q values statistics for logging
|
||||
self.q_values.add_sample(Q)
|
||||
self.l_values.add_sample(L)
|
||||
self.a_values.add_sample(A)
|
||||
self.mu_values.add_sample(mu)
|
||||
self.v_values.add_sample(V)
|
||||
|
||||
action_info = ActionInfo(action=action, action_value=Q)
|
||||
|
||||
return action_info
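For reference, a small sketch of the Q-value decomposition from the NAF paper linked above (the head itself is defined elsewhere, so the shapes and values here are assumptions): Q(s, a) = V(s) - 0.5 * (a - mu)^T * L L^T * (a - mu).

import numpy as np

mu = np.array([0.1, -0.2])                # the action that maximizes Q(s, .)
L = np.array([[1.0, 0.0], [0.3, 0.5]])    # lower-triangular matrix predicted by the NAF head
P = L @ L.T                               # positive-definite curvature of the advantage
V = 1.2                                   # state value
a = np.array([0.0, 0.0])                  # action to evaluate

A = -0.5 * (a - mu) @ P @ (a - mu)        # advantage is 0 at a == mu and negative elsewhere
Q = V + A
print(Q)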
|
||||
176
rl_coach/agents/nec_agent.py
Normal file
@@ -0,0 +1,176 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.architectures.tensorflow_components.heads.dnd_q_head import DNDQHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
|
||||
InputEmbedderParameters
|
||||
from rl_coach.core_types import RunPhase, EnvironmentSteps, Episode, StateType
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, MemoryGranularity
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class NECNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [DNDQHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.optimizer_type = 'Adam'
|
||||
|
||||
|
||||
class NECAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.dnd_size = 500000
|
||||
self.l2_norm_added_delta = 0.001
|
||||
self.new_value_shift_coefficient = 0.1
|
||||
self.number_of_knn = 50
|
||||
self.DND_key_error_threshold = 0
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
self.propagate_updates_to_DND = False
|
||||
self.n_step = 100
|
||||
self.bootstrap_total_return_from_old_policy = True
|
||||
|
||||
|
||||
class NECMemoryParameters(EpisodicExperienceReplayParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_size = (MemoryGranularity.Transitions, 100000)
|
||||
|
||||
|
||||
class NECAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=NECAlgorithmParameters(),
|
||||
exploration=EGreedyParameters(),
|
||||
memory=NECMemoryParameters(),
|
||||
networks={"main": NECNetworkParameters()})
|
||||
self.exploration.epsilon_schedule = ConstantSchedule(0.1)
|
||||
self.exploration.evaluation_epsilon = 0.01
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.nec_agent:NECAgent'
|
||||
|
||||
|
||||
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
|
||||
class NECAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.current_episode_state_embeddings = []
|
||||
self.training_started = False
|
||||
self.current_episode_buffer = \
|
||||
Episode(discount=self.ap.algorithm.discount,
|
||||
n_step=self.ap.algorithm.n_step,
|
||||
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(self.ap.algorithm.number_of_knn):
|
||||
return 0, [], 0
|
||||
else:
|
||||
if not self.training_started:
|
||||
self.training_started = True
|
||||
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
|
||||
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]
|
||||
|
||||
# set the gradients to fetch for the DND update
|
||||
fetches = []
|
||||
head = self.networks['main'].online_network.output_heads[0]
|
||||
if self.ap.algorithm.propagate_updates_to_DND:
|
||||
fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]
|
||||
|
||||
# train the neural network
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets, fetches)
|
||||
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
# update the DND keys and values using the extracted gradients
|
||||
if self.ap.algorithm.propagate_updates_to_DND:
|
||||
embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
|
||||
value_gradients = np.swapaxes(result[-1][1], 0, 1)
|
||||
indices = np.swapaxes(result[-1][2], 0, 1)
|
||||
head.DND.update_keys_and_values(batch.actions(), embedding_gradients, value_gradients, indices)
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
def act(self):
|
||||
if self.phase == RunPhase.HEATUP:
|
||||
# get embedding in heatup (otherwise we get it through get_prediction)
|
||||
embedding = self.networks['main'].online_network.predict(
|
||||
self.prepare_batch_for_inference(self.curr_state, 'main'),
|
||||
outputs=self.networks['main'].online_network.state_embedding)
|
||||
self.current_episode_state_embeddings.append(embedding)
|
||||
|
||||
return super().act()
|
||||
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
# we need to store the state embeddings regardless of whether the action is random or not
|
||||
return self.get_prediction(states)
|
||||
|
||||
def get_prediction(self, states):
|
||||
# get the actions q values and the state embedding
|
||||
embedding, actions_q_values = self.networks['main'].online_network.predict(
|
||||
self.prepare_batch_for_inference(states, 'main'),
|
||||
outputs=[self.networks['main'].online_network.state_embedding,
|
||||
self.networks['main'].online_network.output_heads[0].output]
|
||||
)
|
||||
if self.phase != RunPhase.TEST:
|
||||
# store the state embedding for inserting it to the DND later
|
||||
self.current_episode_state_embeddings.append(embedding.squeeze())
|
||||
actions_q_values = actions_q_values[0][0]
|
||||
return actions_q_values
|
||||
|
||||
def reset_internal_state(self):
|
||||
super().reset_internal_state()
|
||||
self.current_episode_state_embeddings = []
|
||||
self.current_episode_buffer = \
|
||||
Episode(discount=self.ap.algorithm.discount,
|
||||
n_step=self.ap.algorithm.n_step,
|
||||
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
|
||||
|
||||
def handle_episode_ended(self):
|
||||
super().handle_episode_ended()
|
||||
|
||||
# get the last full episode that we have collected
|
||||
episode = self.call_memory('get_last_complete_episode')
|
||||
if episode is not None and self.phase != RunPhase.TEST:
|
||||
assert len(self.current_episode_state_embeddings) == episode.length()
|
||||
returns = episode.get_transitions_attribute('total_return')
|
||||
actions = episode.get_transitions_attribute('action')
|
||||
self.networks['main'].online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
|
||||
actions, returns)
|
||||
|
||||
def save_checkpoint(self, checkpoint_id):
|
||||
with open(os.path.join(self.ap.task_parameters.save_checkpoint_dir, str(checkpoint_id) + '.dnd'), 'wb') as f:
|
||||
pickle.dump(self.networks['main'].online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL)
|
||||
94
rl_coach/agents/pal_agent.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay, \
|
||||
EpisodicExperienceReplayParameters
|
||||
|
||||
|
||||
class PALAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.pal_alpha = 0.9
|
||||
self.persistent_advantage_learning = False
|
||||
self.monte_carlo_mixing_rate = 0.1
|
||||
|
||||
|
||||
class PALAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = PALAlgorithmParameters()
|
||||
self.memory = EpisodicExperienceReplayParameters()
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.pal_agent:PALAgent'
|
||||
|
||||
|
||||
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
|
||||
class PALAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.alpha = agent_parameters.algorithm.pal_alpha
|
||||
self.persistent = agent_parameters.algorithm.persistent_advantage_learning
|
||||
self.monte_carlo_mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# next state values
|
||||
q_st_plus_1_target, q_st_plus_1_online = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.next_states(network_keys))
|
||||
])
|
||||
selected_actions = np.argmax(q_st_plus_1_online, 1)
|
||||
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
|
||||
|
||||
# current state values
|
||||
q_st_target, q_st_online = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
v_st_target = np.max(q_st_target, 1)
|
||||
|
||||
# calculate TD error
|
||||
TD_targets = np.copy(q_st_online)
|
||||
for i in range(self.ap.network_wrappers['main'].batch_size):
|
||||
TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
|
||||
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
|
||||
q_st_plus_1_target[i][selected_actions[i]]
|
||||
advantage_learning_update = v_st_target[i] - q_st_target[i, batch.actions()[i]]
|
||||
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
|
||||
# Persistent Advantage Learning or Regular Advantage Learning
|
||||
if self.persistent:
|
||||
TD_targets[i, batch.actions()[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
|
||||
else:
|
||||
TD_targets[i, batch.actions()[i]] -= self.alpha * advantage_learning_update
|
||||
|
||||
# mixing monte carlo updates
|
||||
monte_carlo_target = batch.total_returns()[i]
|
||||
TD_targets[i, batch.actions()[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, batch.actions()[i]] \
|
||||
+ self.monte_carlo_mixing_rate * monte_carlo_target
|
||||
|
||||
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
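A small numeric sketch of the (persistent) advantage learning correction applied above, with the Monte Carlo mixing step omitted; the Q-values and indices are arbitrary stand-ins for the batch quantities:

import numpy as np

alpha = 0.9
persistent = True
discount = 0.99
reward = 0.5

q_st_target = np.array([1.0, 2.0])         # Q_target(s, .)
q_st_plus_1_target = np.array([1.5, 3.0])  # Q_target(s', .)
action = 0                                 # action taken in the transition
selected_action = 1                        # argmax_a Q_online(s', a)

dqn_target = reward + discount * q_st_plus_1_target[selected_action]
al_update = q_st_target.max() - q_st_target[action]
next_al_update = q_st_plus_1_target.max() - q_st_plus_1_target[selected_action]

correction = min(al_update, next_al_update) if persistent else al_update
target = dqn_target - alpha * correction
print(target)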
|
||||
105
rl_coach/agents/policy_gradients_agent.py
Normal file
@@ -0,0 +1,105 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
|
||||
AgentParameters, InputEmbedderParameters
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.logger import screen
|
||||
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
|
||||
|
||||
|
||||
class PolicyGradientNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
|
||||
self.middleware_parameters = FCMiddlewareParameters()
|
||||
self.heads_parameters = [PolicyHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = True
|
||||
|
||||
|
||||
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
|
||||
self.apply_gradients_every_x_episodes = 5
|
||||
self.beta_entropy = 0
|
||||
self.num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers
|
||||
|
||||
|
||||
class PolicyGradientsAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
|
||||
exploration=AdditiveNoiseParameters(),
|
||||
memory=SingleEpisodeBufferParameters(),
|
||||
networks={"main": PolicyGradientNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.policy_gradients_agent:PolicyGradientsAgent'
|
||||
|
||||
|
||||
class PolicyGradientsAgent(PolicyOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.returns_mean = self.register_signal('Returns Mean')
|
||||
self.returns_variance = self.register_signal('Returns Variance')
|
||||
self.last_gradient_update_step_idx = 0
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
# batch contains a list of episodes to learn from
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
total_returns = batch.total_returns()
|
||||
for i in reversed(range(batch.size)):
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
|
||||
total_returns[i] = total_returns[0]
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
|
||||
# just take the total return as it is
|
||||
pass
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
|
||||
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
|
||||
if self.std_discounted_return != 0:
|
||||
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
|
||||
else:
|
||||
total_returns[i] = 0
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
|
||||
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
targets = total_returns
|
||||
actions = batch.actions()
|
||||
if type(self.spaces.action) != DiscreteActionSpace and len(actions.shape) < 2:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
self.returns_mean.add_sample(np.mean(total_returns))
|
||||
self.returns_variance.add_sample(np.std(total_returns))
|
||||
|
||||
result = self.networks['main'].online_network.accumulate_gradients(
|
||||
{**batch.states(network_keys), 'output_0_0': actions}, targets
|
||||
)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
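A minimal sketch of the FUTURE_RETURN_NORMALIZED_BY_TIMESTEP rescaler used by default above: the running mean return observed at each timestep acts as a per-timestep baseline; the arrays are illustrative:

import numpy as np

mean_return_per_timestep = np.array([3.0, 2.0, 1.0])  # running average over past episodes
future_returns = np.array([4.0, 2.5, 0.5])            # future discounted returns of this episode

rescaled = future_returns - mean_return_per_timestep[:len(future_returns)]
print(rescaled)   # the baseline-subtracted returns that scale the policy gradient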
|
||||
166
rl_coach/agents/policy_optimization_agent.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from collections import OrderedDict
|
||||
from enum import Enum
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.core_types import Batch, ActionInfo
|
||||
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class PolicyGradientRescaler(Enum):
|
||||
TOTAL_RETURN = 0
|
||||
FUTURE_RETURN = 1
|
||||
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
|
||||
FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined
|
||||
Q_VALUE = 4
|
||||
A_VALUE = 5
|
||||
TD_RESIDUAL = 6
|
||||
DISCOUNTED_TD_RESIDUAL = 7
|
||||
GAE = 8
|
||||
|
||||
|
||||
## This is an abstract agent - there is no learn_from_batch method ##
|
||||
|
||||
|
||||
class PolicyOptimizationAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
self.policy_gradient_rescaler = None
|
||||
if hasattr(self.ap.algorithm, 'policy_gradient_rescaler'):
|
||||
self.policy_gradient_rescaler = self.ap.algorithm.policy_gradient_rescaler
|
||||
|
||||
# statistics for variance reduction
|
||||
self.last_gradient_update_step_idx = 0
|
||||
self.max_episode_length = 100000
|
||||
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
|
||||
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
|
||||
self.entropy = self.register_signal('Entropy')
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Name"] = self.full_name_id
|
||||
if self.task_id is not None:
|
||||
log["Worker"] = self.task_id
|
||||
log["Episode"] = self.current_episode
|
||||
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
log["Training iteration"] = self.training_iteration
|
||||
screen.log_dict(log, prefix=self.phase.value)
|
||||
|
||||
def update_episode_statistics(self, episode):
|
||||
episode_discounted_returns = []
|
||||
for i in range(episode.length()):
|
||||
transition = episode.get_transition(i)
|
||||
episode_discounted_returns.append(transition.total_return)
|
||||
self.num_episodes_where_step_has_been_seen[i] += 1
|
||||
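# incremental mean update: the two statements below are equivalent to mean[i] += (x - mean[i]) / n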
self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
|
||||
self.num_episodes_where_step_has_been_seen[i]
|
||||
self.mean_return_over_multiple_episodes[i] += transition.total_return / \
|
||||
self.num_episodes_where_step_has_been_seen[i]
|
||||
self.mean_discounted_return = np.mean(episode_discounted_returns)
|
||||
self.std_discounted_return = np.std(episode_discounted_returns)
|
||||
|
||||
def get_current_episode(self):
|
||||
# most of the time we get the episode from the current episode buffer, and only on the last transition from the
|
||||
# "memory" (where is was stored in the end of the episode)
|
||||
return self.memory.get_episode(0) or self.current_episode_buffer
|
||||
|
||||
def train(self):
|
||||
episode = self.get_current_episode()
|
||||
|
||||
# check if we should calculate gradients or skip
|
||||
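# an update is triggered either when t_max (num_steps_between_gradient_updates) new steps have accumulated
# since the last update, or when the episode ends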
episode_ended = episode.is_complete
|
||||
num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
|
||||
is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
|
||||
if not (is_t_max_steps_passed or episode_ended):
|
||||
return 0
|
||||
|
||||
total_loss = 0
|
||||
if num_steps_passed_since_last_update > 0:
|
||||
|
||||
# we need to update the returns of the episode until now
|
||||
episode.update_returns()
|
||||
|
||||
# get t_max transitions, or fewer if we reached a terminal state
|
||||
# will be used for both actor-critic and vanilla PG.
|
||||
# In order to get full episodes, Vanilla PG will set the end_idx to a very large value.
|
||||
transitions = []
|
||||
start_idx = self.last_gradient_update_step_idx
|
||||
end_idx = episode.length()
|
||||
|
||||
for idx in range(start_idx, end_idx):
|
||||
transitions.append(episode.get_transition(idx))
|
||||
self.last_gradient_update_step_idx = end_idx
|
||||
|
||||
# update the statistics for the variance reduction techniques
|
||||
if self.policy_gradient_rescaler in \
|
||||
[PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
|
||||
PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
|
||||
self.update_episode_statistics(episode)
|
||||
|
||||
# accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
|
||||
batch = Batch(transitions)
|
||||
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
|
||||
if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
|
||||
for network in self.networks.values():
|
||||
network.apply_gradients_and_sync_networks()
|
||||
self.training_iteration += 1
|
||||
|
||||
# move the pointer to the next episode start and discard the episode.
|
||||
if episode_ended:
|
||||
# we need to remove the episode, because the next training iteration will be called before storing any
|
||||
# additional transitions in the memory (we don't store a transition for the first call to observe), so the
|
||||
# length of the memory won't be enforced and the old episode won't be removed
|
||||
self.call_memory('remove_episode', 0)
|
||||
self.last_gradient_update_step_idx = 0
|
||||
|
||||
return total_loss
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
raise NotImplementedError("PolicyOptimizationAgent is an abstract agent. Not to be used directly.")
|
||||
|
||||
def get_prediction(self, states):
|
||||
tf_input_state = self.prepare_batch_for_inference(states, "main")
|
||||
return self.networks['main'].online_network.predict(tf_input_state)
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
# convert to batch so we can run it through the network
|
||||
action_values = self.get_prediction(curr_state)
|
||||
if isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
# DISCRETE
|
||||
action_probabilities = np.array(action_values).squeeze()
|
||||
action = self.exploration_policy.get_action(action_probabilities)
|
||||
action_info = ActionInfo(action=action,
|
||||
action_probability=action_probabilities[action])
|
||||
|
||||
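# policy entropy: H(pi) = -sum_a pi(a|s) * log(pi(a|s)); eps guards against log(0)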
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
|
||||
elif isinstance(self.spaces.action, BoxActionSpace):
|
||||
# CONTINUOUS
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
|
||||
action_info = ActionInfo(action=action)
|
||||
else:
|
||||
raise ValueError("The action space of the environment is not compatible with the algorithm")
|
||||
return action_info
|
||||
338
rl_coach/agents/ppo_agent.py
Normal file
@@ -0,0 +1,338 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from collections import OrderedDict
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
|
||||
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
|
||||
AgentParameters, InputEmbedderParameters, DistributedTaskParameters
|
||||
from rl_coach.core_types import EnvironmentSteps, Batch
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
from rl_coach.utils import force_list
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class PPOCriticNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
|
||||
self.heads_parameters = [VHeadParameters()]
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = True
|
||||
self.l2_regularization = 0
|
||||
self.create_target_network = True
|
||||
self.batch_size = 128
|
||||
|
||||
|
||||
class PPOActorNetworkParameters(NetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
|
||||
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
|
||||
self.heads_parameters = [PPOHeadParameters()]
|
||||
self.optimizer_type = 'Adam'
|
||||
self.loss_weights = [1.0]
|
||||
self.async_training = True
|
||||
self.l2_regularization = 0
|
||||
self.create_target_network = True
|
||||
self.batch_size = 128
|
||||
|
||||
|
||||
class PPOAlgorithmParameters(AlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
|
||||
self.gae_lambda = 0.96
|
||||
self.target_kl_divergence = 0.01
|
||||
self.initial_kl_coefficient = 1.0
|
||||
self.high_kl_penalty_coefficient = 1000
|
||||
self.clip_likelihood_ratio_using_epsilon = None
|
||||
self.value_targets_mix_fraction = 0.1
|
||||
self.estimate_state_value_using_gae = True
|
||||
self.step_until_collecting_full_episodes = True
|
||||
self.use_kl_regularization = True
|
||||
self.beta_entropy = 0.01
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
|
||||
|
||||
|
||||
class PPOAgentParameters(AgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__(algorithm=PPOAlgorithmParameters(),
|
||||
exploration=AdditiveNoiseParameters(),
|
||||
memory=EpisodicExperienceReplayParameters(),
|
||||
networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.ppo_agent:PPOAgent'
|
||||
|
||||
|
||||
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
|
||||
class PPOAgent(ActorCriticAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
|
||||
# signals definition
|
||||
self.value_loss = self.register_signal('Value Loss')
|
||||
self.policy_loss = self.register_signal('Policy Loss')
|
||||
self.kl_divergence = self.register_signal('KL Divergence')
|
||||
self.total_kl_divergence_during_training_process = 0.0
|
||||
self.unclipped_grads = self.register_signal('Grads (unclipped)')
|
||||
|
||||
def fill_advantages(self, batch):
|
||||
batch = Batch(batch)
|
||||
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
|
||||
|
||||
# * Found not to have any impact *
|
||||
# current_states_with_timestep = self.concat_state_and_timestep(batch)
|
||||
|
||||
current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()
|
||||
|
||||
# calculate advantages
|
||||
advantages = []
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
|
||||
advantages = batch.total_returns() - current_state_values
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
|
||||
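# Generalized Advantage Estimation: A_t = sum_l (gamma * lambda)^l * delta_{t+l}, where
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); each complete rollout below gets a zero bootstrap value appended
# and is processed by get_general_advantage_estimation_values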
# get bootstraps
|
||||
episode_start_idx = 0
|
||||
advantages = np.array([])
|
||||
# current_state_values[batch.game_overs()] = 0
|
||||
for idx, game_over in enumerate(batch.game_overs()):
|
||||
if game_over:
|
||||
# get advantages for the rollout
|
||||
value_bootstrapping = np.zeros((1,))
|
||||
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
|
||||
|
||||
rollout_advantages, _ = \
|
||||
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
|
||||
rollout_state_values)
|
||||
episode_start_idx = idx + 1
|
||||
advantages = np.append(advantages, rollout_advantages)
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
# standardize
|
||||
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
|
||||
|
||||
# TODO: this will be problematic with a shared memory
|
||||
for transition, advantage in zip(self.memory.transitions, advantages):
|
||||
transition.info['advantage'] = advantage
|
||||
|
||||
self.action_advantages.add_sample(advantages)
|
||||
|
||||
def train_value_network(self, dataset, epochs):
|
||||
loss = []
|
||||
batch = Batch(dataset)
|
||||
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
|
||||
|
||||
# * Found not to have any impact *
|
||||
# add a timestep to the observation
|
||||
# current_states_with_timestep = self.concat_state_and_timestep(dataset)
|
||||
|
||||
mix_fraction = self.ap.algorithm.value_targets_mix_fraction
|
||||
for j in range(epochs):
|
||||
curr_batch_size = batch.size
|
||||
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
|
||||
curr_batch_size = self.ap.network_wrappers['critic'].batch_size
|
||||
for i in range(batch.size // curr_batch_size):
|
||||
# split to batches for first order optimization techniques
|
||||
current_states_batch = {
|
||||
k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
|
||||
for k, v in batch.states(network_keys).items()
|
||||
}
|
||||
total_return_batch = batch.total_returns(True)[i * curr_batch_size:(i + 1) * curr_batch_size]
|
||||
old_policy_values = force_list(self.networks['critic'].target_network.predict(
|
||||
current_states_batch).squeeze())
|
||||
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
|
||||
targets = total_return_batch
|
||||
else:
|
||||
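# for LBFGS (full-batch fits), mix the previous value predictions with the returns; presumably this keeps each
# fit close to the old value function, acting as a soft trust region on the critic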
current_values = self.networks['critic'].online_network.predict(current_states_batch)
|
||||
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
|
||||
|
||||
inputs = copy.copy(current_states_batch)
|
||||
for input_index, input in enumerate(old_policy_values):
|
||||
name = 'output_0_{}'.format(input_index)
|
||||
if name in self.networks['critic'].online_network.inputs:
|
||||
inputs[name] = input
|
||||
|
||||
value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)
|
||||
|
||||
self.networks['critic'].apply_gradients_to_online_network()
|
||||
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
|
||||
self.networks['critic'].apply_gradients_to_global_network()
|
||||
self.networks['critic'].online_network.reset_accumulated_gradients()
|
||||
|
||||
loss.append([value_loss[0]])
|
||||
loss = np.mean(loss, 0)
|
||||
return loss
|
||||
|
||||
def concat_state_and_timestep(self, dataset):
|
||||
current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
|
||||
for transition in dataset]
|
||||
current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
|
||||
return current_states_with_timestep
|
||||
|
||||
def train_policy_network(self, dataset, epochs):
|
||||
loss = []
|
||||
for j in range(epochs):
|
||||
loss = {
|
||||
'total_loss': [],
|
||||
'policy_losses': [],
|
||||
'unclipped_grads': [],
|
||||
'fetch_result': []
|
||||
}
|
||||
#shuffle(dataset)
|
||||
for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
|
||||
batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
|
||||
(i + 1) * self.ap.network_wrappers['actor'].batch_size])
|
||||
|
||||
network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
|
||||
|
||||
advantages = batch.info('advantage')
|
||||
actions = batch.actions()
|
||||
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
|
||||
actions = np.expand_dims(actions, -1)
|
||||
|
||||
# get old policy probabilities and distribution
|
||||
old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))
|
||||
|
||||
# calculate gradients and apply on both the local policy network and on the global policy network
|
||||
fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
|
||||
self.networks['actor'].online_network.output_heads[0].entropy]
|
||||
|
||||
inputs = copy.copy(batch.states(network_keys))
|
||||
inputs['output_0_0'] = actions
|
||||
|
||||
# old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
|
||||
# it has just a mean. Otherwise, it has both a mean and a standard deviation
|
||||
for input_index, input in enumerate(old_policy):
|
||||
inputs['output_0_{}'.format(input_index + 1)] = input
|
||||
|
||||
total_loss, policy_losses, unclipped_grads, fetch_result =\
|
||||
self.networks['actor'].online_network.accumulate_gradients(
|
||||
inputs, [advantages], additional_fetches=fetches)
|
||||
|
||||
self.networks['actor'].apply_gradients_to_online_network()
|
||||
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
|
||||
self.networks['actor'].apply_gradients_to_global_network()
|
||||
|
||||
self.networks['actor'].online_network.reset_accumulated_gradients()
|
||||
|
||||
loss['total_loss'].append(total_loss)
|
||||
loss['policy_losses'].append(policy_losses)
|
||||
loss['unclipped_grads'].append(unclipped_grads)
|
||||
loss['fetch_result'].append(fetch_result)
|
||||
|
||||
self.unclipped_grads.add_sample(unclipped_grads)
|
||||
|
||||
for key in loss.keys():
|
||||
loss[key] = np.mean(loss[key], 0)
|
||||
|
||||
if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
|
||||
curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
|
||||
self.curr_learning_rate.add_sample(curr_learning_rate)
|
||||
else:
|
||||
curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate
|
||||
|
||||
# log training parameters
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
("Surrogate loss", loss['policy_losses'][0]),
|
||||
("KL divergence", loss['fetch_result'][0]),
|
||||
("Entropy", loss['fetch_result'][1]),
|
||||
("training epoch", j),
|
||||
("learning_rate", curr_learning_rate)
|
||||
]),
|
||||
prefix="Policy training"
|
||||
)
|
||||
|
||||
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
|
||||
self.entropy.add_sample(loss['fetch_result'][1])
|
||||
self.kl_divergence.add_sample(loss['fetch_result'][0])
|
||||
return loss['total_loss']
|
||||
|
||||
def update_kl_coefficient(self):
|
||||
# John Schulman takes the mean KL divergence only over the last epoch, which is strange, but we will follow
|
||||
# his implementation for now because we know it works well
|
||||
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
|
||||
|
||||
# update kl coefficient
|
||||
kl_target = self.ap.algorithm.target_kl_divergence
|
||||
kl_coefficient = self.networks['actor'].online_network.get_variable_value(
|
||||
self.networks['actor'].online_network.output_heads[0].kl_coefficient)
|
||||
new_kl_coefficient = kl_coefficient
|
||||
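# adaptive KL penalty: increase the coefficient when the measured KL overshoots the target and decrease it when
# it undershoots (the 1.3 / 0.7 thresholds and the x1.5 step are the values used here)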
if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
|
||||
# kl too high => increase regularization
|
||||
new_kl_coefficient *= 1.5
|
||||
elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
|
||||
# kl too low => decrease regularization
|
||||
new_kl_coefficient /= 1.5
|
||||
|
||||
# update the kl coefficient variable
|
||||
if kl_coefficient != new_kl_coefficient:
|
||||
self.networks['actor'].online_network.set_variable_value(
|
||||
self.networks['actor'].online_network.output_heads[0].assign_kl_coefficient,
|
||||
new_kl_coefficient,
|
||||
self.networks['actor'].online_network.output_heads[0].kl_coefficient_ph)
|
||||
|
||||
screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
|
||||
|
||||
def post_training_commands(self):
|
||||
if self.ap.algorithm.use_kl_regularization:
|
||||
self.update_kl_coefficient()
|
||||
|
||||
# clean memory
|
||||
self.call_memory('clean')
|
||||
|
||||
def train(self):
|
||||
loss = 0
|
||||
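# PPO training flow: sync the local actor/critic from the global networks, compute advantages over the collected
# on-policy memory, then fit the critic (1 epoch) and the actor (10 epochs) on that dataset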
if self._should_train(wait_for_full_episode=True):
|
||||
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
|
||||
self.networks['actor'].sync()
|
||||
self.networks['critic'].sync()
|
||||
|
||||
dataset = self.memory.transitions
|
||||
|
||||
self.fill_advantages(dataset)
|
||||
|
||||
# take only the requested number of steps
|
||||
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
|
||||
|
||||
value_loss = self.train_value_network(dataset, 1)
|
||||
policy_loss = self.train_policy_network(dataset, 10)
|
||||
|
||||
self.value_loss.add_sample(value_loss)
|
||||
self.policy_loss.add_sample(policy_loss)
|
||||
|
||||
self.post_training_commands()
|
||||
self.training_iteration += 1
|
||||
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
|
||||
return np.append(value_loss, policy_loss)
|
||||
|
||||
def get_prediction(self, states):
|
||||
tf_input_state = self.prepare_batch_for_inference(states, "actor")
|
||||
return self.networks['actor'].online_network.predict(tf_input_state)
|
||||
112
rl_coach/agents/qr_dqn_agent.py
Normal file
@@ -0,0 +1,112 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.architectures.tensorflow_components.heads.quantile_regression_q_head import QuantileRegressionQHeadParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters, DQNAlgorithmParameters
|
||||
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from rl_coach.core_types import StateType
|
||||
|
||||
|
||||
class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.heads_parameters = [QuantileRegressionQHeadParameters()]
|
||||
self.learning_rate = 0.00005
|
||||
self.optimizer_epsilon = 0.01 / 32
|
||||
|
||||
|
||||
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.atoms = 200
|
||||
self.huber_loss_interval = 1 # called k in the paper
|
||||
|
||||
|
||||
class QuantileRegressionDQNAgentParameters(DQNAgentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.algorithm = QuantileRegressionDQNAlgorithmParameters()
|
||||
self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
|
||||
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
|
||||
self.exploration.evaluation_epsilon = 0.001
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.qr_dqn_agent:QuantileRegressionDQNAgent'
|
||||
|
||||
|
||||
# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
|
||||
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.quantile_probabilities = np.ones(self.ap.algorithm.atoms) / float(self.ap.algorithm.atoms)
|
||||
|
||||
def get_q_values(self, quantile_values):
|
||||
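# Q(s, a) is the expectation over the quantile values - a uniform average with weight 1/atoms per quantile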
return np.dot(quantile_values, self.quantile_probabilities)
|
||||
|
||||
# the prediction's format is (batch, actions, atoms)
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
quantile_values = self.get_prediction(states)
|
||||
actions_q_values = self.get_q_values(quantile_values)
|
||||
else:
|
||||
actions_q_values = None
|
||||
return actions_q_values
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
|
||||
|
||||
# get the quantiles of the next states and current states
|
||||
next_state_quantiles, current_quantiles = self.networks['main'].parallel_prediction([
|
||||
(self.networks['main'].target_network, batch.next_states(network_keys)),
|
||||
(self.networks['main'].online_network, batch.states(network_keys))
|
||||
])
|
||||
|
||||
# get the optimal actions to take for the next states
|
||||
target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)
|
||||
|
||||
# calculate the Bellman update
|
||||
batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))
|
||||
|
||||
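# distributional Bellman target: shift every next-state quantile of the greedy action by the reward and discount
# it; terminal transitions keep only the reward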
TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
|
||||
* next_state_quantiles[batch_idx, target_actions]
|
||||
|
||||
# get the locations of the selected actions within the batch for indexing purposes
|
||||
actions_locations = [[b, a] for b, a in zip(batch_idx, batch.actions())]
|
||||
|
||||
# calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
|
||||
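# quantile midpoints: tau^hat_i = (tau_i + tau_{i+1}) / 2, used as the target probabilities for the quantile
# Huber loss (following the QR-DQN paper)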
cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms) # tau_i
|
||||
quantile_midpoints = 0.5*(cumulative_probabilities[1:] + cumulative_probabilities[:-1]) # tau^hat_i
|
||||
quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
|
||||
sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
|
||||
for idx in range(self.ap.network_wrappers['main'].batch_size):
|
||||
quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]
|
||||
|
||||
# train
|
||||
result = self.networks['main'].train_and_sync_networks({
|
||||
**batch.states(network_keys),
|
||||
'output_0_0': actions_locations,
|
||||
'output_0_1': quantile_midpoints,
|
||||
}, TD_targets)
|
||||
total_loss, losses, unclipped_grads = result[:3]
|
||||
|
||||
return total_loss, losses, unclipped_grads
|
||||
|
||||
98
rl_coach/agents/value_optimization_agent.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
|
||||
from rl_coach.spaces import DiscreteActionSpace
|
||||
|
||||
from rl_coach.agents.agent import Agent
|
||||
from rl_coach.core_types import ActionInfo, StateType
|
||||
|
||||
|
||||
## This is an abstract agent - there is no learn_from_batch method ##
|
||||
|
||||
|
||||
class ValueOptimizationAgent(Agent):
|
||||
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
|
||||
super().__init__(agent_parameters, parent)
|
||||
self.q_values = self.register_signal("Q")
|
||||
self.q_value_for_action = {}
|
||||
|
||||
def init_environment_dependent_modules(self):
|
||||
super().init_environment_dependent_modules()
|
||||
if isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
for i in range(len(self.spaces.action.actions)):
|
||||
self.q_value_for_action[i] = self.register_signal("Q for action {}".format(i),
|
||||
dump_one_value_per_episode=False,
|
||||
dump_one_value_per_step=True)
|
||||
|
||||
# Algorithms for which q_values are calculated from predictions will override this function
|
||||
def get_all_q_values_for_states(self, states: StateType):
|
||||
if self.exploration_policy.requires_action_values():
|
||||
actions_q_values = self.get_prediction(states)
|
||||
else:
|
||||
actions_q_values = None
|
||||
return actions_q_values
|
||||
|
||||
def get_prediction(self, states):
|
||||
return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))
|
||||
|
||||
def update_transition_priorities_and_get_weights(self, TD_errors, batch):
|
||||
# update errors in prioritized replay buffer
|
||||
importance_weights = None
|
||||
if isinstance(self.memory, PrioritizedExperienceReplay):
|
||||
self.call_memory('update_priorities', (batch.info('idx'), TD_errors))
|
||||
importance_weights = batch.info('weight')
|
||||
return importance_weights
|
||||
|
||||
def _validate_action(self, policy, action):
|
||||
if np.array(action).shape != ():
|
||||
raise ValueError((
|
||||
'The exploration_policy {} returned a vector of actions '
|
||||
'instead of a single action. ValueOptimizationAgents '
|
||||
'require exploration policies which return a single action.'
|
||||
).format(policy.__class__.__name__))
|
||||
|
||||
def choose_action(self, curr_state):
|
||||
actions_q_values = self.get_all_q_values_for_states(curr_state)
|
||||
|
||||
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
|
||||
action = self.exploration_policy.get_action(actions_q_values)
|
||||
self._validate_action(self.exploration_policy, action)
|
||||
|
||||
if actions_q_values is not None:
|
||||
# this is for bootstrapped dqn
|
||||
if type(actions_q_values) == list and len(actions_q_values) > 0:
|
||||
actions_q_values = self.exploration_policy.last_action_values
|
||||
actions_q_values = actions_q_values.squeeze()
|
||||
|
||||
# store the q values statistics for logging
|
||||
self.q_values.add_sample(actions_q_values)
|
||||
for i, q_value in enumerate(actions_q_values):
|
||||
self.q_value_for_action[i].add_sample(q_value)
|
||||
|
||||
action_info = ActionInfo(action=action,
|
||||
action_value=actions_q_values[action],
|
||||
max_action_value=np.max(actions_q_values))
|
||||
else:
|
||||
action_info = ActionInfo(action=action)
|
||||
|
||||
return action_info
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")
|
||||
15
rl_coach/architectures/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
71
rl_coach/architectures/architecture.py
Normal file
@@ -0,0 +1,71 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class Architecture(object):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= ""):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces (observation, action, etc.) definition of the agent
|
||||
:param name: the name of the network
|
||||
"""
|
||||
# spaces
|
||||
self.spaces = spaces
|
||||
|
||||
self.name = name
|
||||
self.network_wrapper_name = self.name.split('/')[0] # the name can be main/online and the network_wrapper_name will be main
|
||||
self.full_name = "{}/{}".format(agent_parameters.full_name_id, name)
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
|
||||
self.batch_size = self.network_parameters.batch_size
|
||||
self.learning_rate = self.network_parameters.learning_rate
|
||||
self.optimizer = None
|
||||
self.ap = agent_parameters
|
||||
|
||||
def get_model(self):
|
||||
pass
|
||||
|
||||
def predict(self, inputs):
|
||||
pass
|
||||
|
||||
def train_on_batch(self, inputs, targets):
|
||||
pass
|
||||
|
||||
def get_weights(self):
|
||||
pass
|
||||
|
||||
def set_weights(self, weights, rate=1.0):
|
||||
pass
|
||||
|
||||
def reset_accumulated_gradients(self):
|
||||
pass
|
||||
|
||||
def accumulate_gradients(self, inputs, targets):
|
||||
pass
|
||||
|
||||
def apply_and_reset_gradients(self, gradients):
|
||||
pass
|
||||
|
||||
def apply_gradients(self, gradients):
|
||||
pass
|
||||
|
||||
def get_variable_value(self, variable):
|
||||
pass
|
||||
|
||||
def set_variable_value(self, assign_op, value, placeholder=None):
|
||||
pass
|
||||
210
rl_coach/architectures/network_wrapper.py
Normal file
@@ -0,0 +1,210 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List, Tuple
|
||||
|
||||
from rl_coach.base_parameters import Frameworks, AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.logger import failed_imports
|
||||
|
||||
try:
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork
|
||||
except ImportError:
|
||||
failed_imports.append("TensorFlow")
|
||||
|
||||
|
||||
class NetworkWrapper(object):
|
||||
"""
|
||||
Contains multiple networks and manages syncing and gradient updates
|
||||
between them.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
|
||||
spaces: SpacesDefinition, replicated_device=None, worker_device=None):
|
||||
self.ap = agent_parameters
|
||||
self.network_parameters = self.ap.network_wrappers[name]
|
||||
self.has_target = has_target
|
||||
self.has_global = has_global
|
||||
self.name = name
|
||||
self.sess = None
|
||||
|
||||
if self.network_parameters.framework == Frameworks.tensorflow:
|
||||
general_network = GeneralTensorFlowNetwork
|
||||
else:
|
||||
raise Exception("{} Framework is not supported"
|
||||
.format(Frameworks().to_string(self.network_parameters.framework)))
|
||||
|
||||
with tf.variable_scope("{}/{}".format(self.ap.full_name_id, name)):
|
||||
|
||||
# Global network - the main network shared between threads
|
||||
self.global_network = None
|
||||
if self.has_global:
|
||||
# we assign the parameters of this network on the parameter server
|
||||
with tf.device(replicated_device):
|
||||
self.global_network = general_network(agent_parameters=agent_parameters,
|
||||
name='{}/global'.format(name),
|
||||
global_network=None,
|
||||
network_is_local=False,
|
||||
spaces=spaces,
|
||||
network_is_trainable=True)
|
||||
|
||||
# Online network - local copy of the main network used for playing
|
||||
self.online_network = None
|
||||
with tf.device(worker_device):
|
||||
self.online_network = general_network(agent_parameters=agent_parameters,
|
||||
name='{}/online'.format(name),
|
||||
global_network=self.global_network,
|
||||
network_is_local=True,
|
||||
spaces=spaces,
|
||||
network_is_trainable=True)
|
||||
|
||||
# Target network - a local, slow updating network used for stabilizing the learning
|
||||
self.target_network = None
|
||||
if self.has_target:
|
||||
with tf.device(worker_device):
|
||||
self.target_network = general_network(agent_parameters=agent_parameters,
|
||||
name='{}/target'.format(name),
|
||||
global_network=self.global_network,
|
||||
network_is_local=True,
|
||||
spaces=spaces,
|
||||
network_is_trainable=False)
|
||||
|
||||
def sync(self):
|
||||
"""
|
||||
Initializes the weights of the networks to match each other
|
||||
:return:
|
||||
"""
|
||||
self.update_online_network()
|
||||
self.update_target_network()
|
||||
|
||||
def update_target_network(self, rate=1.0):
|
||||
"""
|
||||
Copy weights: online network >>> target network
|
||||
:param rate: the rate of copying the weights - 1 for copying exactly
|
||||
"""
|
||||
if self.target_network:
|
||||
self.target_network.set_weights(self.online_network.get_weights(), rate)
|
||||
|
||||
def update_online_network(self, rate=1.0):
|
||||
"""
|
||||
Copy weights: global network >>> online network
|
||||
:param rate: the rate of copying the weights - 1 for copying exactly
|
||||
"""
|
||||
if self.global_network:
|
||||
self.online_network.set_weights(self.global_network.get_weights(), rate)
|
||||
|
||||
def apply_gradients_to_global_network(self, gradients=None):
|
||||
"""
|
||||
Apply gradients from the online network on the global network
|
||||
:param gradients: optional gradients that will be used instead of the accumulated gradients
|
||||
:return:
|
||||
"""
|
||||
if gradients is None:
|
||||
gradients = self.online_network.accumulated_gradients
|
||||
if self.network_parameters.shared_optimizer:
|
||||
self.global_network.apply_gradients(gradients)
|
||||
else:
|
||||
self.online_network.apply_gradients(gradients)
|
||||
|
||||
def apply_gradients_to_online_network(self, gradients=None):
|
||||
"""
|
||||
Apply gradients from the online network on itself
|
||||
:return:
|
||||
"""
|
||||
if gradients is None:
|
||||
gradients = self.online_network.accumulated_gradients
|
||||
self.online_network.apply_gradients(gradients)
|
||||
|
||||
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
|
||||
"""
|
||||
A generic training function that enables multi-threaded training using a global network if necessary.
|
||||
:param inputs: The inputs for the network.
|
||||
:param targets: The targets corresponding to the given inputs
|
||||
:param additional_fetches: Any additional tensor the user wants to fetch
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the sample losses won't be scaled
|
||||
:return: The loss of the training iteration
|
||||
"""
|
||||
result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
|
||||
importance_weights=importance_weights, no_accumulation=True)
|
||||
self.apply_gradients_and_sync_networks(reset_gradients=False)
|
||||
return result
|
||||
|
||||
def apply_gradients_and_sync_networks(self, reset_gradients=True):
|
||||
"""
|
||||
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
|
||||
networks if necessary
|
||||
:param reset_gradients: If set to False, the accumulated gradients won't be reset to 0 after applying them to
the network. This is useful when the accumulated gradients are overwritten (rather than
accumulated) by the accumulate_gradients function, and it reduces the run time of this
function by around 10%.
|
||||
"""
|
||||
if self.global_network:
|
||||
self.apply_gradients_to_global_network()
|
||||
if reset_gradients:
|
||||
self.online_network.reset_accumulated_gradients()
|
||||
self.update_online_network()
|
||||
else:
|
||||
if reset_gradients:
|
||||
self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients)
|
||||
else:
|
||||
self.online_network.apply_gradients(self.online_network.accumulated_gradients)
|
||||
|
||||
def parallel_prediction(self, network_input_tuples: List[Tuple]):
|
||||
"""
|
||||
Run several network predictions in parallel. Currently this only supports running each of the networks once.
|
||||
:param network_input_tuples: a list of tuples where the first element is the network (online_network,
|
||||
target_network or global_network) and the second element is the inputs
|
||||
:return: the outputs of all the networks in the same order as the inputs were given
|
||||
"""
|
||||
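# example usage (hypothetical variable names), mirroring how QR-DQN calls it:
# next_q, curr_q = wrapper.parallel_prediction([(wrapper.target_network, next_states),
#                                               (wrapper.online_network, states)])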
feed_dict = {}
|
||||
fetches = []
|
||||
|
||||
for idx, (network, input) in enumerate(network_input_tuples):
|
||||
feed_dict.update(network.create_feed_dict(input))
|
||||
fetches += network.outputs
|
||||
|
||||
outputs = self.sess.run(fetches, feed_dict)
|
||||
|
||||
return outputs
|
||||
|
||||
def get_local_variables(self):
|
||||
"""
|
||||
Get all the variables that are local to the thread
|
||||
:return: a list of all the variables that are local to the thread
|
||||
"""
|
||||
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
|
||||
if self.has_target:
|
||||
local_variables += [v for v in tf.local_variables() if self.target_network.name in v.name]
|
||||
return local_variables
|
||||
|
||||
def get_global_variables(self):
|
||||
"""
|
||||
Get all the variables that are shared between threads
|
||||
:return: a list of all the variables that are shared between threads
|
||||
"""
|
||||
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
|
||||
return global_variables
|
||||
|
||||
def set_session(self, sess):
|
||||
self.sess = sess
|
||||
self.online_network.set_session(sess)
|
||||
if self.global_network:
|
||||
self.global_network.set_session(sess)
|
||||
if self.target_network:
|
||||
self.target_network.set_session(sess)
|
||||
|
||||
664
rl_coach/architectures/tensorflow_components/architecture.py
Normal file
@@ -0,0 +1,664 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import force_list, squeeze_list
|
||||
|
||||
from rl_coach.architectures.architecture import Architecture
|
||||
from rl_coach.core_types import GradientClippingMethod
|
||||
|
||||
|
||||
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
|
||||
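# applies batch normalization, the activation function, and dropout (in that order), each only if enabled,
# and returns only the layers that were added on top of input_layer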
layers = [input_layer]
|
||||
|
||||
# batchnorm
|
||||
if batchnorm:
|
||||
layers.append(
|
||||
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# activation
|
||||
if activation_function:
|
||||
layers.append(
|
||||
activation_function(layers[-1], name="activation{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# dropout
|
||||
if dropout:
|
||||
layers.append(
|
||||
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
|
||||
)
|
||||
|
||||
# remove the input layer from the layers list
|
||||
del layers[0]
|
||||
|
||||
return layers
|
||||
|
||||
|
||||
class Conv2d(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_filters, kernel_size, strides]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow conv2d layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: conv2d layer
|
||||
"""
|
||||
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
|
||||
data_format='channels_last', name=name)
|
||||
|
||||
|
||||
class Dense(object):
|
||||
def __init__(self, params: List):
|
||||
"""
|
||||
:param params: list of [num_output_neurons]
|
||||
"""
|
||||
self.params = params
|
||||
|
||||
def __call__(self, input_layer, name: str):
|
||||
"""
|
||||
returns a tensorflow dense layer
|
||||
:param input_layer: previous layer
|
||||
:param name: layer name
|
||||
:return: dense layer
|
||||
"""
|
||||
return tf.layers.dense(input_layer, self.params[0], name=name)
|
||||
|
||||
|
||||
def variable_summaries(var):
|
||||
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
|
||||
with tf.name_scope('summaries'):
|
||||
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
|
||||
|
||||
with tf.name_scope(layer_weight_name):
|
||||
mean = tf.reduce_mean(var)
|
||||
tf.summary.scalar('mean', mean)
|
||||
with tf.name_scope('stddev'):
|
||||
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
|
||||
tf.summary.scalar('stddev', stddev)
|
||||
tf.summary.scalar('max', tf.reduce_max(var))
|
||||
tf.summary.scalar('min', tf.reduce_min(var))
|
||||
tf.summary.histogram('histogram', var)
|
||||
|
||||
|
||||
def local_getter(getter, name, *args, **kwargs):
|
||||
"""
|
||||
This is a wrapper around the tf.get_variable function which puts the variables in the local variables collection
|
||||
instead of the global variables collection. The local variables collection will hold variables which are not shared
|
||||
between workers. These variables are also assumed to be non-trainable (the optimizer does not apply gradients to
|
||||
these variables), but we can calculate the gradients wrt these variables, and we can update their content.
|
||||
"""
|
||||
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
|
||||
return getter(name, *args, **kwargs)
|
||||
|
||||
|
||||
class TensorFlowArchitecture(Architecture):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
super().__init__(agent_parameters, spaces, name)
|
||||
self.middleware = None
|
||||
self.network_is_local = network_is_local
|
||||
self.global_network = global_network
|
||||
if not self.network_parameters.tensorflow_support:
|
||||
raise ValueError('TensorFlow is not supported for this agent')
|
||||
self.sess = None
|
||||
self.inputs = {}
|
||||
self.outputs = []
|
||||
self.targets = []
|
||||
self.importance_weights = []
|
||||
self.losses = []
|
||||
self.total_loss = None
|
||||
self.trainable_weights = []
|
||||
self.weights_placeholders = []
|
||||
self.shared_accumulated_gradients = []
|
||||
self.curr_rnn_c_in = None
|
||||
self.curr_rnn_h_in = None
|
||||
self.gradients_wrt_inputs = []
|
||||
self.train_writer = None
|
||||
self.accumulated_gradients = None
|
||||
self.network_is_trainable = network_is_trainable
|
||||
|
||||
self.is_chief = self.ap.task_parameters.task_index == 0
|
||||
self.network_is_global = not self.network_is_local and global_network is None
|
||||
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
|
||||
|
||||
self.optimizer_type = self.network_parameters.optimizer_type
|
||||
if self.ap.task_parameters.seed is not None:
|
||||
tf.set_random_seed(self.ap.task_parameters.seed)
|
||||
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
|
||||
custom_getter=local_getter if network_is_local and global_network else None):
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
|
||||
# build the network
|
||||
self.get_model()
|
||||
|
||||
# model weights
|
||||
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
|
||||
|
||||
# create the placeholders for assigning the gradients and some tensorboard summaries for the weights
|
||||
for idx, var in enumerate(self.weights):
|
||||
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
|
||||
self.weights_placeholders.append(placeholder)
|
||||
if self.ap.visualization.tensorboard:
|
||||
variable_summaries(var)
|
||||
|
||||
# create op for assigning a list of weights to the network weights
|
||||
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
|
||||
zip(self.weights_placeholders, self.weights)]
|
||||
|
||||
# locks for synchronous training
|
||||
if self.network_is_global:
|
||||
self._create_locks_for_synchronous_training()
|
||||
|
||||
# gradients ops
|
||||
self._create_gradient_ops()
|
||||
|
||||
# L2 regularization
|
||||
if self.network_parameters.l2_regularization != 0:
|
||||
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
|
||||
* self.network_parameters.l2_regularization]
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
|
||||
|
||||
self.inc_step = self.global_step.assign_add(1)
|
||||
|
||||
# reset LSTM hidden cells
|
||||
self.reset_internal_memory()
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
|
||||
scope=tf.contrib.framework.get_name_scope())
|
||||
self.merged = tf.summary.merge(current_scope_summaries)
|
||||
|
||||
# initialize or restore model
|
||||
self.init_op = tf.group(
|
||||
tf.global_variables_initializer(),
|
||||
tf.local_variables_initializer()
|
||||
)
|
||||
|
||||
# set the fetches for training
|
||||
self._set_initial_fetch_list()
|
||||
|
||||
def _set_initial_fetch_list(self):
|
||||
"""
|
||||
Create an initial list of tensors to fetch in each training iteration
|
||||
:return: None
|
||||
"""
|
||||
self.train_fetches = [self.gradients_norm]
|
||||
if self.network_parameters.clip_gradients:
|
||||
self.train_fetches.append(self.clipped_grads)
|
||||
else:
|
||||
self.train_fetches.append(self.tensor_gradients)
|
||||
self.train_fetches += [self.total_loss, self.losses]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.train_fetches.append(self.middleware.state_out)
|
||||
self.additional_fetches_start_idx = len(self.train_fetches)
|
||||
|
||||
def _create_locks_for_synchronous_training(self):
|
||||
"""
|
||||
Create locks for synchronizing the different workers during training
|
||||
:return: None
|
||||
"""
|
||||
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.lock = self.lock_counter.assign_add(1, use_locking=True)
|
||||
self.lock_init = self.lock_counter.assign(0)
|
||||
|
||||
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
self.release = self.release_counter.assign_add(1, use_locking=True)
|
||||
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
|
||||
self.release_init = self.release_counter.assign(0)
|
||||
|
||||
def _create_gradient_ops(self):
|
||||
"""
|
||||
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
|
||||
:return: None
|
||||
"""
|
||||
|
||||
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
|
||||
self.gradients_norm = tf.global_norm(self.tensor_gradients)
|
||||
|
||||
# gradient clipping
|
||||
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
|
||||
self._create_gradient_clipping_ops()
|
||||
|
||||
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
|
||||
# applying them
|
||||
if self.distributed_training:
|
||||
self._create_gradient_accumulators()
|
||||
|
||||
# gradients of the outputs w.r.t. the inputs
|
||||
# at the moment, this is only used by ddpg
|
||||
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
|
||||
self.inputs.items()} for output in self.outputs]
|
||||
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
|
||||
for i in range(len(self.outputs))]
|
||||
self.weighted_gradients = []
|
||||
for i in range(len(self.outputs)):
|
||||
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
|
||||
# unnormalized gradients seem to work better at the moment. TODO: validate this across more environments
|
||||
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
|
||||
# unnormalized_gradients)))
|
||||
self.weighted_gradients.append(unnormalized_gradients)
|
||||
|
||||
# defining the optimization process (for LBFGS we have less control over the optimizer)
|
||||
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
|
||||
self._create_gradient_applying_ops()
|
||||
|
||||
def _create_gradient_accumulators(self):
|
||||
if self.network_is_global:
|
||||
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
|
||||
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
|
||||
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
|
||||
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
|
||||
self.shared_accumulated_gradients]
|
||||
elif self.network_is_local:
|
||||
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
|
||||
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
|
||||
|
||||
def _create_gradient_clipping_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
|
||||
:return: None
|
||||
"""
|
||||
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
|
||||
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
|
||||
self.clipped_grads = [tf.clip_by_value(grad,
|
||||
-self.network_parameters.clip_gradients,
|
||||
self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
|
||||
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
|
||||
for grad in self.tensor_gradients]
|
||||
|
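# A rough standalone sketch (not part of the original file) of the three clipping modes created above,
# using numpy only; `clip_value` stands in for self.network_parameters.clip_gradients.
import numpy as np

def clip_gradients_sketch(gradients, method, clip_value):
    # method is one of 'global_norm', 'value', 'norm', mirroring GradientClippingMethod
    if method == 'global_norm':
        # scale all gradients jointly so that their global L2 norm does not exceed clip_value
        global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in gradients))
        scale = min(1.0, clip_value / (global_norm + 1e-8))
        return [g * scale for g in gradients]
    elif method == 'value':
        # clip each gradient element-wise into [-clip_value, clip_value]
        return [np.clip(g, -clip_value, clip_value) for g in gradients]
    elif method == 'norm':
        # clip each gradient tensor separately by its own L2 norm
        return [g * min(1.0, clip_value / (np.linalg.norm(g) + 1e-8)) for g in gradients]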
||||
def _create_gradient_applying_ops(self):
|
||||
"""
|
||||
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
|
||||
(distributed training - local or global network, shared optimizer, etc.)
|
||||
:return: None
|
||||
"""
|
||||
if self.network_is_global and self.network_parameters.shared_optimizer and \
|
||||
not self.network_parameters.async_training:
|
||||
# synchronous training with shared optimizer? -> create an operation for applying the gradients
|
||||
# accumulated in the shared gradients accumulator
|
||||
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.shared_accumulated_gradients, self.weights),
|
||||
global_step=self.global_step)
|
||||
|
||||
elif self.distributed_training and self.network_is_local:
|
||||
# distributed training but independent optimizer? -> create an operation for applying the gradients
|
||||
# to the global weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
|
||||
|
||||
elif self.network_is_trainable:
|
||||
# not any of the above but is trainable? -> create an operation for applying the gradients to
|
||||
# this network weights
|
||||
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
|
||||
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
|
||||
|
||||
def set_session(self, sess):
|
||||
self.sess = sess
|
||||
|
||||
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
|
||||
# initialize the session parameters in single threaded runs. Otherwise, this is done through the
|
||||
# MonitoredSession object in the graph manager
|
||||
if not task_is_distributed:
|
||||
self.sess.run(self.init_op)
|
||||
|
||||
if self.ap.visualization.tensorboard:
|
||||
# Write the merged summaries to the current experiment directory
|
||||
if not task_is_distributed:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
elif self.network_is_local:
|
||||
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
|
||||
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
|
||||
self.train_writer.add_graph(self.sess.graph)
|
||||
|
||||
# wait for all the workers to set their session
|
||||
if not self.network_is_local:
|
||||
self.wait_for_all_workers_barrier()
|
||||
|
||||
def reset_accumulated_gradients(self):
|
||||
"""
|
||||
Reset the gradients accumulation placeholder
|
||||
"""
|
||||
if self.accumulated_gradients is None:
|
||||
self.accumulated_gradients = self.sess.run(self.weights)
|
||||
|
||||
for ix, grad in enumerate(self.accumulated_gradients):
|
||||
self.accumulated_gradients[ix] = grad * 0
|
||||
|
||||
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
|
||||
no_accumulation=False):
|
||||
"""
|
||||
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
|
||||
placeholders
|
||||
:param additional_fetches: Optional tensors to fetch during gradients calculation
|
||||
:param inputs: The input batch for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the sample losses won't be scaled
|
||||
:param no_accumulation: If is set to True, the gradients in the accumulated gradients placeholder will be
|
||||
replaced by the newly calculated gradients instead of accumulating the new gradients.
|
||||
This can speed up the function runtime by around 10%.
|
||||
:return: A list containing the total loss and the individual network heads losses
|
||||
"""
|
||||
|
||||
if self.accumulated_gradients is None:
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
# feed inputs
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
|
||||
# feed targets
|
||||
targets = force_list(targets)
|
||||
for placeholder_idx, target in enumerate(targets):
|
||||
feed_dict[self.targets[placeholder_idx]] = target
|
||||
|
||||
# feed importance weights
|
||||
importance_weights = force_list(importance_weights)
|
||||
for placeholder_idx, target_ph in enumerate(targets):
|
||||
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
|
||||
importance_weight = np.ones(target_ph.shape[0])
|
||||
else:
|
||||
importance_weight = importance_weights[placeholder_idx]
|
||||
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
|
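# e.g. for a batch of 32 targets with shape (32, 4), a weight vector of shape (32,) is reshaped
# to (32, 1) here so that it broadcasts over the per-element loss of each sample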
||||
|
||||
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
# feed the lstm state if necessary
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
# we can't always assume that we are starting from scratch here, can we?
|
||||
feed_dict[self.middleware.c_in] = self.middleware.c_init
|
||||
feed_dict[self.middleware.h_in] = self.middleware.h_init
|
||||
|
||||
fetches = self.train_fetches + additional_fetches
|
||||
if self.ap.visualization.tensorboard:
|
||||
fetches += [self.merged]
|
||||
|
||||
# get grads
|
||||
result = self.sess.run(fetches, feed_dict=feed_dict)
|
||||
if hasattr(self, 'train_writer') and self.train_writer is not None:
|
||||
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
|
||||
|
||||
# extract the fetches
|
||||
norm_unclipped_grads, grads, total_loss, losses = result[:4]
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
|
||||
fetched_tensors = []
|
||||
if len(additional_fetches) > 0:
|
||||
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
|
||||
len(additional_fetches)]
|
||||
|
||||
# accumulate the gradients
|
||||
for idx, grad in enumerate(grads):
|
||||
if no_accumulation:
|
||||
self.accumulated_gradients[idx] = grad
|
||||
else:
|
||||
self.accumulated_gradients[idx] += grad
|
||||
|
||||
return total_loss, losses, norm_unclipped_grads, fetched_tensors
|
||||
|
||||
else:
|
||||
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
|
||||
|
||||
return [0]
|
||||
|
||||
def create_feed_dict(self, inputs):
|
||||
feed_dict = {}
|
||||
for input_name, input_value in inputs.items():
|
||||
if isinstance(input_name, str):
|
||||
if input_name not in self.inputs:
|
||||
raise ValueError((
|
||||
'input name {input_name} was provided to create a feed '
|
||||
'dictionary, but there is no placeholder with that name. '
|
||||
'placeholder names available include: {placeholder_names}'
|
||||
).format(
|
||||
input_name=input_name,
|
||||
placeholder_names=', '.join(self.inputs.keys())
|
||||
))
|
||||
|
||||
feed_dict[self.inputs[input_name]] = input_value
|
||||
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
|
||||
feed_dict[input_name] = input_value
|
||||
else:
|
||||
raise ValueError((
|
||||
'input dictionary expects strings or placeholders as keys, '
|
||||
'but found key {key} of type {type}'
|
||||
).format(
|
||||
key=input_name,
|
||||
type=type(input_name),
|
||||
))
|
||||
|
||||
return feed_dict
|
||||
|
||||
def apply_and_reset_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights and resets the accumulation placeholder
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
"""
|
||||
self.apply_gradients(gradients, scaler)
|
||||
self.reset_accumulated_gradients()
|
||||
|
||||
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
|
||||
"""
|
||||
Waits for all the workers to lock a certain lock and then continues
|
||||
:param lock: the name of the lock to use
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
if include_only_training_workers:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
|
||||
else:
|
||||
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
|
||||
|
||||
# lock
|
||||
if hasattr(self, '{}_counter'.format(lock)):
|
||||
self.sess.run(getattr(self, lock))
|
||||
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
|
||||
time.sleep(0.00001)
|
||||
# self.sess.run(getattr(self, '{}_init'.format(lock)))
|
||||
else:
|
||||
raise ValueError("no counter was defined for the lock {}".format(lock))
|
||||
|
||||
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
|
||||
"""
|
||||
A barrier that allows waiting for all the workers to finish a certain block of commands
|
||||
:param include_only_training_workers: wait only for training workers or for all the workers?
|
||||
:return: None
|
||||
"""
|
||||
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.lock_init)
|
||||
|
||||
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers continues
|
||||
# and is then able to increase the first lock again by one, only to have a late worker reset it again.
|
||||
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
|
||||
|
||||
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
|
||||
self.sess.run(self.release_init)
|
||||
|
||||
def apply_gradients(self, gradients, scaler=1.):
|
||||
"""
|
||||
Applies the given gradients to the network weights
|
||||
:param gradients: The gradients to use for the update
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
|
||||
The gradients will be MULTIPLIED by this factor
|
||||
"""
|
||||
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
|
||||
if hasattr(self, 'global_step') and not self.network_is_local:
|
||||
self.sess.run(self.inc_step)
|
||||
|
||||
if self.optimizer_type != 'LBFGS':
|
||||
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
# rescale the gradients so that they average out with the gradients from the other workers
|
||||
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
|
||||
scaler /= float(self.ap.task_parameters.num_training_tasks)
|
||||
|
||||
# rescale the gradients
|
||||
if scaler != 1.:
|
||||
for gradient in gradients:
|
||||
gradient *= scaler
|
||||
|
||||
# apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
if self.distributed_training and self.network_parameters.shared_optimizer \
|
||||
and not self.network_parameters.async_training:
|
||||
# synchronous distributed training with shared optimizer:
|
||||
# - each worker adds its gradients to the shared gradients accumulators
|
||||
# - we wait for all the workers to add their gradients
|
||||
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
|
||||
|
||||
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
|
||||
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
||||
if self.is_chief:
|
||||
self.sess.run(self.update_weights_from_shared_gradients)
|
||||
self.sess.run(self.init_shared_accumulated_gradients)
|
||||
else:
|
||||
# async distributed training / distributed training with independent optimizer
|
||||
# / non-distributed training - just apply the gradients
|
||||
feed_dict = dict(zip(self.weights_placeholders, gradients))
|
||||
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
|
||||
|
||||
# release barrier
|
||||
if self.distributed_training and not self.network_parameters.async_training:
|
||||
self.wait_for_all_workers_barrier(include_only_training_workers=True)
|
||||
|
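# A small illustrative sketch (not part of the original file) of why the gradients are divided by the
# number of training workers in synchronous training: accumulating every worker's scaled gradients in the
# shared accumulator yields the average gradient across workers.
import numpy as np

def average_of_worker_gradients(worker_gradients):
    # worker_gradients: list of per-worker gradients (one numpy array per worker) for a single weight
    num_workers = len(worker_gradients)
    accumulator = np.zeros_like(worker_gradients[0])
    for grad in worker_gradients:
        accumulator += grad / num_workers   # each worker applies scaler = 1 / num_training_tasks
    return accumulator                      # equals np.mean(worker_gradients, axis=0)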
||||
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
|
||||
"""
|
||||
Run a forward pass of the network using the given input
|
||||
:param inputs: The input for the network
|
||||
:param outputs: The output for the network, defaults to self.outputs
|
||||
:param squeeze_output: call squeeze_list on output
|
||||
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
|
||||
:return: The network output
|
||||
|
||||
WARNING: must only be called once per state, since the LSTM treats each call as a new time step.
|
||||
"""
|
||||
feed_dict = self.create_feed_dict(inputs)
|
||||
if initial_feed_dict:
|
||||
feed_dict.update(initial_feed_dict)
|
||||
if outputs is None:
|
||||
outputs = self.outputs
|
||||
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
|
||||
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
|
||||
|
||||
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
|
||||
feed_dict=feed_dict)
|
||||
else:
|
||||
output = self.sess.run(outputs, feed_dict)
|
||||
|
||||
if squeeze_output:
|
||||
output = squeeze_list(output)
|
||||
return output
|
||||
|
||||
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
|
||||
"""
|
||||
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
|
||||
:param additional_fetches: Optional tensors to fetch during the training process
|
||||
:param inputs: The input for the network
|
||||
:param targets: The targets corresponding to the input batch
|
||||
:param scaler: A scaling factor that allows rescaling the gradients before applying them
|
||||
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
|
||||
error of this sample. If it is not given, the sample losses won't be scaled
|
||||
:return: The loss of the network
|
||||
"""
|
||||
if additional_fetches is None:
|
||||
additional_fetches = []
|
||||
additional_fetches = force_list(additional_fetches)
|
||||
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
|
||||
importance_weights=importance_weights)
|
||||
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
|
||||
return loss
|
||||
|
||||
def get_weights(self):
|
||||
"""
|
||||
:return: a list of tensors containing the network weights for each layer
|
||||
"""
|
||||
return self.weights
|
||||
|
||||
def set_weights(self, weights, new_rate=1.0):
|
||||
"""
|
||||
Sets the network weights from the given list of weights tensors
|
||||
"""
|
||||
feed_dict = {}
|
||||
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
|
||||
for placeholder_idx, new_weight in enumerate(new_weights):
|
||||
feed_dict[self.weights_placeholders[placeholder_idx]]\
|
||||
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
|
||||
self.sess.run(self.update_weights_from_list, feed_dict)
|
||||
|
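# A minimal numpy sketch (not part of the original file) of the soft update rule used above:
# with new_rate=1.0 the weights are copied, and with 0 < new_rate < 1 they are moved towards the
# given weights (the Polyak-averaging style update used e.g. for target networks).
import numpy as np

def soft_update(old_weights, new_weights, new_rate=1.0):
    return [new_rate * new + (1.0 - new_rate) * old
            for old, new in zip(old_weights, new_weights)]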
||||
def get_variable_value(self, variable):
|
||||
"""
|
||||
Get the value of a variable from the graph
|
||||
:param variable: the variable
|
||||
:return: the value of the variable
|
||||
"""
|
||||
return self.sess.run(variable)
|
||||
|
||||
def set_variable_value(self, assign_op, value, placeholder=None):
|
||||
"""
|
||||
Updates the value of a variable.
|
||||
This requires having an assign operation for the variable, and a placeholder which will provide the value
|
||||
:param assign_op: an assign operation for the variable
|
||||
:param value: a value to set the variable to
|
||||
:param placeholder: a placeholder to hold the given value for injecting it into the variable
|
||||
"""
|
||||
self.sess.run(assign_op, feed_dict={placeholder: value})
|
||||
|
||||
def reset_internal_memory(self):
|
||||
"""
|
||||
Reset any internal memory used by the network. For example, an LSTM internal state
|
||||
:return: None
|
||||
"""
|
||||
# initialize LSTM hidden states
|
||||
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
|
||||
self.curr_rnn_c_in = self.middleware.c_init
|
||||
self.curr_rnn_h_in = self.middleware.h_init
|
||||
@@ -0,0 +1,102 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
|
||||
"""
|
||||
Creates a ClusterSpec object representing the cluster.
|
||||
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
|
||||
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
|
||||
:return: a ClusterSpec object representing the cluster
|
||||
"""
|
||||
# extract the parameter servers and workers from the given strings
|
||||
ps_hosts = parameters_server.split(",")
|
||||
worker_hosts = workers.split(",")
|
||||
|
||||
# Create a cluster spec from the parameter server and worker hosts
|
||||
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
|
||||
|
||||
return cluster_spec
|
||||
|
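# Example usage (illustrative only): one parameter server and two workers on the local machine.
# cluster = create_cluster_spec(parameters_server="localhost:2222",
#                               workers="localhost:2223,localhost:2224")
# cluster.as_dict() -> {'ps': ['localhost:2222'], 'worker': ['localhost:2223', 'localhost:2224']}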
||||
|
||||
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
|
||||
"""
|
||||
Create and start a parameter server
|
||||
:param cluster_spec: the ClusterSpec object representing the cluster
|
||||
:param config: the tensorflow config to use
|
||||
:return: None
|
||||
"""
|
||||
# create a server object for the parameter server
|
||||
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
|
||||
|
||||
# wait for the server to finish
|
||||
server.join()
|
||||
|
||||
|
||||
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
|
||||
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
|
||||
"""
|
||||
Creates a worker server and a device setter used to assign the workers operations to
|
||||
:param cluster_spec: a ClusterSpec object representing the cluster
|
||||
:param task_index: the index of the worker task
|
||||
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
|
||||
:param config: the tensorflow config to use
|
||||
:return: the target string for the tf.Session and the worker device setter object
|
||||
"""
|
||||
# Create and start a worker
|
||||
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
|
||||
|
||||
# Assign ops to the local worker
|
||||
worker_device = "/job:worker/task:{}".format(task_index)
|
||||
if use_cpu:
|
||||
worker_device += "/cpu:0"
|
||||
else:
|
||||
worker_device += "/device:GPU:0"
|
||||
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
|
||||
|
||||
return server.target, device
|
||||
|
||||
|
||||
def create_monitored_session(target: tf.train.Server, task_index: int,
|
||||
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
|
||||
"""
|
||||
Create a monitored session for the worker
|
||||
:param target: the target string for the tf.Session
|
||||
:param task_index: the task index of the worker
|
||||
:param checkpoint_dir: a directory path where the checkpoints will be stored
|
||||
:param save_checkpoint_secs: number of seconds between checkpoint saves
|
||||
:param config: the tensorflow configuration (optional)
|
||||
:return: the session to use for the run
|
||||
"""
|
||||
# we chose the first task to be the chief
|
||||
is_chief = task_index == 0
|
||||
|
||||
# Create the monitored session
|
||||
sess = tf.train.MonitoredTrainingSession(
|
||||
master=target,
|
||||
is_chief=is_chief,
|
||||
hooks=[],
|
||||
checkpoint_dir=checkpoint_dir,
|
||||
save_checkpoint_secs=save_checkpoint_secs,
|
||||
config=config
|
||||
)
|
||||
|
||||
return sess
|
||||
|
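# A hedged end-to-end sketch (not part of the original file) of how the helpers above are meant to be
# combined by a worker process; the addresses and checkpoint directory are placeholders.
def _example_worker_setup(task_index: int):
    cluster_spec = create_cluster_spec(parameters_server="localhost:2222",
                                       workers="localhost:2223,localhost:2224")
    # start this worker's server and get a device setter that places variables on the parameter server
    target, device = create_worker_server_and_device(cluster_spec, task_index, use_cpu=True)
    with tf.device(device):
        # ... build the graph here ...
        pass
    # the chief (task_index == 0) will also take care of checkpointing
    sess = create_monitored_session(target, task_index,
                                    checkpoint_dir="/tmp/checkpoints", save_checkpoint_secs=600)
    return sess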
||||
@@ -0,0 +1,114 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List, Union
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
from rl_coach.core_types import InputEmbedding
|
||||
|
||||
|
||||
class InputEmbedder(object):
|
||||
"""
|
||||
An input embedder is the first part of the network, which takes the input from the state and produces a vector
|
||||
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
|
||||
can be multiple embedders in a single network
|
||||
"""
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
|
||||
self.name = name
|
||||
self.input_size = input_size
|
||||
self.activation_function = activation_function
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
self.dropout_rate = 0
|
||||
self.input = None
|
||||
self.output = None
|
||||
self.scheme = scheme
|
||||
self.return_type = InputEmbedding
|
||||
self.layers = []
|
||||
self.input_rescaling = input_rescaling
|
||||
self.input_offset = input_offset
|
||||
self.input_clipping = input_clipping
|
||||
|
||||
def __call__(self, prev_input_placeholder=None):
|
||||
with tf.variable_scope(self.get_name()):
|
||||
if prev_input_placeholder is None:
|
||||
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
|
||||
else:
|
||||
self.input = prev_input_placeholder
|
||||
self._build_module()
|
||||
|
||||
return self.input, self.output
|
||||
|
||||
def _build_module(self):
|
||||
# NOTE: for image inputs, we expect the data format to be uint8 in order to be memory efficient. we chose not
|
||||
# to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
|
||||
# input to the network to be float, which is 4x more expensive in memory,
|
||||
# thus making each saved transition in the memory 4x more expensive as well.
|
||||
|
||||
input_layer = self.input / self.input_rescaling
|
||||
input_layer -= self.input_offset
|
||||
# clip the input using the given range
|
||||
if self.input_clipping is not None:
|
||||
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
|
||||
|
||||
self.layers.append(input_layer)
|
||||
|
||||
# layers order is conv -> batchnorm -> activation -> dropout
|
||||
if isinstance(self.scheme, EmbedderScheme):
|
||||
layers_params = self.schemes[self.scheme]
|
||||
else:
|
||||
layers_params = self.scheme
|
||||
for idx, layer_params in enumerate(layers_params):
|
||||
self.layers.append(
|
||||
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
|
||||
)
|
||||
|
||||
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
|
||||
self.activation_function, self.dropout,
|
||||
self.dropout_rate, idx))
|
||||
|
||||
self.output = tf.contrib.layers.flatten(self.layers[-1])
|
||||
|
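# A minimal numpy sketch (not part of the original file) of the preprocessing applied above to a uint8
# image input; the specific values input_rescaling=255.0, input_offset=0.5 and input_clipping=(-1.0, 1.0)
# are used purely for illustration.
import numpy as np

def preprocess_observation(observation, input_rescaling=255.0, input_offset=0.5, input_clipping=(-1.0, 1.0)):
    x = observation.astype(np.float32) / input_rescaling   # rescale e.g. uint8 [0, 255] -> [0, 1]
    x -= input_offset                                       # shift by the given offset
    if input_clipping is not None:
        x = np.clip(x, input_clipping[0], input_clipping[1])
    return x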
||||
@property
|
||||
def input_size(self) -> List[int]:
|
||||
return self._input_size
|
||||
|
||||
@input_size.setter
|
||||
def input_size(self, value: Union[int, List[int]]):
|
||||
if isinstance(value, np.ndarray) or isinstance(value, tuple):
|
||||
value = list(value)
|
||||
elif isinstance(value, int):
|
||||
value = [value]
|
||||
if not isinstance(value, list):
|
||||
raise ValueError((
|
||||
'input_size expected to be a list, found {value} which has type {type}'
|
||||
).format(value=value, type=type(value)))
|
||||
self._input_size = value
|
||||
|
||||
@property
|
||||
def schemes(self):
|
||||
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
|
||||
"configurations.")
|
||||
|
||||
def get_name(self):
|
||||
return self.name
|
||||
@@ -0,0 +1,74 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputImageEmbedding
|
||||
|
||||
|
||||
class ImageEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that performs convolutions on the input and then flattens the result.
|
||||
The embedder is intended for image like inputs, where the channels are expected to be the last axis.
|
||||
The embedder also allows custom rescaling of the input prior to the neural network.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Conv2d([32, 3, 1])
|
||||
],
|
||||
|
||||
# atari dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Conv2d([32, 8, 4]),
|
||||
Conv2d([64, 4, 2]),
|
||||
Conv2d([64, 3, 1])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Conv2d([32, 5, 2]),
|
||||
Conv2d([32, 3, 1]),
|
||||
Conv2d([64, 3, 2]),
|
||||
Conv2d([64, 3, 1]),
|
||||
Conv2d([128, 3, 2]),
|
||||
Conv2d([128, 3, 1]),
|
||||
Conv2d([256, 3, 2]),
|
||||
Conv2d([256, 3, 1])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
|
||||
input_offset, input_clipping)
|
||||
self.return_type = InputImageEmbedding
|
||||
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
|
||||
.format(input_size))
|
||||
|
||||
|
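# Illustrative note (not part of the original file): each Conv2d([filters, kernel_size, stride]) entry above
# appears to describe a single convolutional layer; under that assumption, the Medium scheme corresponds
# roughly to the classic Atari DQN convolution stack, e.g.:
def _medium_scheme_sketch(input_layer):
    x = tf.layers.conv2d(input_layer, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
    x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
    x = tf.layers.conv2d(x, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
    return tf.contrib.layers.flatten(x)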
||||
@@ -0,0 +1,64 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import EmbedderScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
|
||||
from rl_coach.core_types import InputVectorEmbedding
|
||||
|
||||
|
||||
class VectorEmbedder(InputEmbedder):
|
||||
"""
|
||||
An input embedder that is intended for inputs that can be represented as vectors.
|
||||
The embedder flattens the input, applies several dense layers to it and returns the output.
|
||||
"""
|
||||
schemes = {
|
||||
EmbedderScheme.Empty:
|
||||
[],
|
||||
|
||||
EmbedderScheme.Shallow:
|
||||
[
|
||||
Dense([128])
|
||||
],
|
||||
|
||||
# dqn
|
||||
EmbedderScheme.Medium:
|
||||
[
|
||||
Dense([256])
|
||||
],
|
||||
|
||||
# carla
|
||||
EmbedderScheme.Deep: \
|
||||
[
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
|
||||
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
|
||||
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
|
||||
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
|
||||
input_rescaling, input_offset, input_clipping)
|
||||
|
||||
self.return_type = InputVectorEmbedding
|
||||
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
|
||||
raise ValueError("The input size of a vector embedder must contain only a single dimension")
|
||||
344
rl_coach/architectures/tensorflow_components/general_network.py
Normal file
@@ -0,0 +1,344 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import copy
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
|
||||
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
|
||||
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
|
||||
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
|
||||
from rl_coach.core_types import PredictionType
|
||||
|
||||
|
||||
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
|
||||
"""
|
||||
A generalized version of all possible networks implemented using tensorflow.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
|
||||
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
|
||||
"""
|
||||
:param agent_parameters: the agent parameters
|
||||
:param spaces: the spaces definition of the agent
|
||||
:param name: the name of the network
|
||||
:param global_network: the global network replica that is shared between all the workers
|
||||
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
|
||||
:param network_is_trainable: is the network trainable (we can apply gradients on it)
|
||||
"""
|
||||
self.global_network = global_network
|
||||
self.network_is_local = network_is_local
|
||||
self.network_wrapper_name = name.split('/')[0]
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
|
||||
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
|
||||
len(self.network_parameters.heads_parameters)
|
||||
|
||||
self.gradients_from_head_rescalers = []
|
||||
self.gradients_from_head_rescalers_placeholders = []
|
||||
self.update_head_rescaler_value_ops = []
|
||||
|
||||
self.adaptive_learning_rate_scheme = None
|
||||
self.current_learning_rate = None
|
||||
|
||||
# init network modules containers
|
||||
self.input_embedders = []
|
||||
self.output_heads = []
|
||||
super().__init__(agent_parameters, spaces, name, global_network,
|
||||
network_is_local, network_is_trainable)
|
||||
|
||||
def fill_return_types():
|
||||
ret_dict = {}
|
||||
for cls in get_all_subclasses(PredictionType):
|
||||
ret_dict[cls] = []
|
||||
components = self.input_embedders + [self.middleware] + self.output_heads
|
||||
for component in components:
|
||||
if not hasattr(component, 'return_type'):
|
||||
raise ValueError("{} has no return_type attribute. This should not happen.")
|
||||
if component.return_type is not None:
|
||||
ret_dict[component.return_type].append(component)
|
||||
|
||||
return ret_dict
|
||||
|
||||
self.available_return_types = fill_return_types()
|
||||
|
||||
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
|
||||
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
|
||||
"""
|
||||
Search for the component (or components) whose return_type is set to the requested PredictionType, and get
|
||||
predictions for it.
|
||||
|
||||
:param states: The input states to the network.
|
||||
:param prediction_type: The requested PredictionType to look for in the network components
|
||||
:return: A dictionary with predictions for all components matching the requested prediction type
|
||||
"""
|
||||
|
||||
ret_dict = {}
|
||||
for component in self.available_return_types[prediction_type]:
|
||||
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
|
||||
|
||||
return ret_dict
|
||||
|
||||
@staticmethod
|
||||
def get_activation_function(activation_function_string: str):
|
||||
"""
|
||||
Map the activation function from a string to the tensorflow framework equivalent
|
||||
:param activation_function_string: the type of the activation function
|
||||
:return: the tensorflow activation function
|
||||
"""
|
||||
activation_functions = {
|
||||
'relu': tf.nn.relu,
|
||||
'tanh': tf.nn.tanh,
|
||||
'sigmoid': tf.nn.sigmoid,
|
||||
'elu': tf.nn.elu,
|
||||
'selu': tf.nn.selu,
|
||||
'leaky_relu': tf.nn.leaky_relu,
|
||||
'none': None
|
||||
}
|
||||
assert activation_function_string in activation_functions.keys(), \
|
||||
"Activation function must be one of the following {}. instead it was: {}"\
|
||||
.format(activation_functions.keys(), activation_function_string)
|
||||
return activation_functions[activation_function_string]
|
||||
|
||||
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
|
||||
"""
|
||||
Given an input embedder parameters class, creates the input embedder and returns it
|
||||
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
|
||||
be a value within the state or the action.
|
||||
:param embedder_params: the parameters of the class of the embedder
|
||||
:return: the embedder instance
|
||||
"""
|
||||
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
|
||||
allowed_inputs["action"] = copy.copy(self.spaces.action)
|
||||
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
|
||||
|
||||
if input_name not in allowed_inputs.keys():
|
||||
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
|
||||
.format(input_name, allowed_inputs.keys()))
|
||||
|
||||
type = "vector"
|
||||
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
|
||||
type = "image"
|
||||
|
||||
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
|
||||
embedder_params_copy = copy.copy(embedder_params)
|
||||
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
|
||||
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
|
||||
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
|
||||
embedder_params_copy.name = input_name
|
||||
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
|
||||
path=embedder_path,
|
||||
positional_args=[allowed_inputs[input_name].shape])
|
||||
return module
|
||||
|
||||
def get_middleware(self, middleware_params: MiddlewareParameters):
|
||||
"""
|
||||
Given a middleware type, creates the middleware and returns it
|
||||
:param middleware_params: the parameters of the middleware class
|
||||
:return: the middleware instance
|
||||
"""
|
||||
middleware_params_copy = copy.copy(middleware_params)
|
||||
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
|
||||
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
|
||||
return module
|
||||
|
||||
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
|
||||
"""
|
||||
Given a head type, creates the head and returns it
|
||||
:param head_params: the parameters of the head to create
|
||||
:param head_idx: the head index
|
||||
:param loss_weight: the weight to assign to the loss of this head
|
||||
:return: the head
|
||||
"""
|
||||
|
||||
head_params_copy = copy.copy(head_params)
|
||||
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
|
||||
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
|
||||
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
|
||||
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
|
||||
|
||||
def get_model(self):
|
||||
# validate the configuration
|
||||
if len(self.network_parameters.input_embedders_parameters) == 0:
|
||||
raise ValueError("At least one input type should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) == 0:
|
||||
raise ValueError("At least one output type should be defined")
|
||||
|
||||
if self.network_parameters.middleware_parameters is None:
|
||||
raise ValueError("Exactly one middleware type should be defined")
|
||||
|
||||
if len(self.network_parameters.loss_weights) == 0:
|
||||
raise ValueError("At least one loss weight should be defined")
|
||||
|
||||
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
|
||||
raise ValueError("Number of loss weights should match the number of output types")
|
||||
|
||||
for network_idx in range(self.num_networks):
|
||||
with tf.variable_scope('network_{}'.format(network_idx)):
|
||||
|
||||
####################
|
||||
# Input Embeddings #
|
||||
####################
|
||||
|
||||
state_embedding = []
|
||||
for input_name in sorted(self.network_parameters.input_embedders_parameters):
|
||||
input_type = self.network_parameters.input_embedders_parameters[input_name]
|
||||
# get the class of the input embedder
|
||||
input_embedder = self.get_input_embedder(input_name, input_type)
|
||||
self.input_embedders.append(input_embedder)
|
||||
|
||||
# input placeholders are reused between networks. on the first network, store the placeholders
|
||||
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
|
||||
# the existing input_placeholders into the input_embedders.
|
||||
if network_idx == 0:
|
||||
input_placeholder, embedding = input_embedder()
|
||||
self.inputs[input_name] = input_placeholder
|
||||
else:
|
||||
input_placeholder, embedding = input_embedder(self.inputs[input_name])
|
||||
|
||||
state_embedding.append(embedding)
|
||||
|
||||
##########
|
||||
# Merger #
|
||||
##########
|
||||
|
||||
if len(state_embedding) == 1:
|
||||
state_embedding = state_embedding[0]
|
||||
else:
|
||||
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
|
||||
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
|
||||
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
|
||||
state_embedding = tf.add_n(state_embedding, name="merger")
|
||||
|
||||
##############
|
||||
# Middleware #
|
||||
##############
|
||||
|
||||
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
|
||||
_, self.state_embedding = self.middleware(state_embedding)
|
||||
|
||||
################
|
||||
# Output Heads #
|
||||
################
|
||||
|
||||
head_count = 0
|
||||
for head_idx in range(self.num_heads_per_network):
|
||||
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
|
||||
if self.network_parameters.use_separate_networks_per_head:
|
||||
# if we use separate networks per head, then the head type corresponds to the network idx
|
||||
head_type_idx = network_idx
|
||||
head_count = network_idx
|
||||
else:
|
||||
# if we use a single network with multiple heads, then the head type is the current head idx
|
||||
head_type_idx = head_idx
|
||||
self.output_heads.append(
|
||||
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
|
||||
head_copy_idx,
|
||||
self.network_parameters.loss_weights[head_type_idx])
|
||||
)
|
||||
|
||||
# rescale the gradients from the head
|
||||
self.gradients_from_head_rescalers.append(
|
||||
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
|
||||
initializer=float(
|
||||
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
|
||||
),
|
||||
dtype=tf.float32))
|
||||
|
||||
self.gradients_from_head_rescalers_placeholders.append(
|
||||
tf.placeholder('float',
|
||||
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
|
||||
|
||||
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
|
||||
self.gradients_from_head_rescalers_placeholders[head_count]))
|
||||
|
||||
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
|
||||
self.gradients_from_head_rescalers[head_count] * self.state_embedding
|
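# note: with rescaler r this leaves the forward value of the embedding unchanged, but only a fraction r
# of the head's gradient flows back into the shared embedding, since tf.stop_gradient blocks the (1 - r) path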
||||
|
||||
# build the head
|
||||
if self.network_is_local:
|
||||
output, target_placeholder, input_placeholders, importance_weight_ph = \
|
||||
self.output_heads[-1](head_input)
|
||||
|
||||
self.targets.extend(target_placeholder)
|
||||
self.importance_weights.extend(importance_weight_ph)
|
||||
else:
|
||||
output, input_placeholders = self.output_heads[-1](head_input)
|
||||
|
||||
self.outputs.extend(output)
|
||||
# TODO: use head names as well
|
||||
for placeholder_index, input_placeholder in enumerate(input_placeholders):
|
||||
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
|
||||
|
||||
head_count += 1
|
||||
|
||||
# Losses
|
||||
self.losses = tf.losses.get_losses(self.full_name)
|
||||
self.losses += tf.losses.get_regularization_losses(self.full_name)
|
||||
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
|
||||
# tf.summary.scalar('total_loss', self.total_loss)
|
||||
|
||||
# Learning rate
|
||||
if self.network_parameters.learning_rate_decay_rate != 0:
|
||||
self.adaptive_learning_rate_scheme = \
|
||||
tf.train.exponential_decay(
|
||||
self.network_parameters.learning_rate,
|
||||
self.global_step,
|
||||
decay_steps=self.network_parameters.learning_rate_decay_steps,
|
||||
decay_rate=self.network_parameters.learning_rate_decay_rate,
|
||||
staircase=True)
|
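# with staircase=True the decayed learning rate follows
# learning_rate * decay_rate ** floor(global_step / decay_steps)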
||||
|
||||
self.current_learning_rate = self.adaptive_learning_rate_scheme
|
||||
else:
|
||||
self.current_learning_rate = self.network_parameters.learning_rate
|
||||
|
||||
# Optimizer
|
||||
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
|
||||
# distributed training + is a local network + optimizer shared -> take the global optimizer
|
||||
self.optimizer = self.global_network.optimizer
|
||||
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
|
||||
or self.network_parameters.shared_optimizer or not self.distributed_training:
|
||||
# distributed training + is a global network + optimizer shared
|
||||
# OR
|
||||
# distributed training + is a local network + optimizer not shared
|
||||
# OR
|
||||
# non-distributed training
|
||||
# -> create an optimizer
|
||||
|
||||
if self.network_parameters.optimizer_type == 'Adam':
|
||||
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
|
||||
beta1=self.network_parameters.adam_optimizer_beta1,
|
||||
beta2=self.network_parameters.adam_optimizer_beta2,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'RMSProp':
|
||||
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
|
||||
decay=self.network_parameters.rms_prop_optimizer_decay,
|
||||
epsilon=self.network_parameters.optimizer_epsilon)
|
||||
elif self.network_parameters.optimizer_type == 'LBFGS':
|
||||
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
|
||||
options={'maxiter': 25})
|
||||
else:
|
||||
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))
|
||||
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class CategoricalQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
|
||||
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class CategoricalQHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'categorical_dqn_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_atoms = agent_parameters.algorithm.atoms
|
||||
self.return_type = QActionStateValue
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
self.input = [self.actions]
|
||||
|
||||
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
|
||||
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
|
||||
self.num_atoms))
|
||||
# softmax on atoms dimension
|
||||
self.output = tf.nn.softmax(values_distribution)
|
||||
|
||||
# calculate cross entropy loss
|
||||
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
|
||||
name="distributions")
|
||||
self.target = self.distributions
|
||||
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
|
||||
tf.losses.add_loss(self.loss)
|
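# A hedged numpy sketch (not part of the original file): the head above outputs, per action, a softmax
# distribution over self.num_atoms atoms (as in Categorical/C51 DQN). Given the atom support values
# (assumed here to be an evenly spaced grid between v_min and v_max, which are not defined in this file),
# the corresponding Q values are the expectations under those distributions:
import numpy as np

def expected_q_values(distributions, v_min=-10.0, v_max=10.0):
    # distributions: array of shape (batch_size, num_actions, num_atoms), rows sum to 1 on the last axis
    num_atoms = distributions.shape[-1]
    atom_support = np.linspace(v_min, v_max, num_atoms)
    return np.sum(distributions * atom_support, axis=-1)   # shape (batch_size, num_actions)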
||||
@@ -0,0 +1,66 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class DDPGActorHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
|
||||
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
|
||||
class DDPGActor(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
|
||||
batchnorm: bool=True):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ddpg_actor_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
self.num_actions = self.spaces.action.shape
|
||||
|
||||
self.batchnorm = batchnorm
|
||||
|
||||
# bounded actions
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
|
||||
if hasattr(agent_parameters.algorithm, 'action_penalty'):
|
||||
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
|
||||
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
|
||||
self.activation_function,
|
||||
False, 0, 0)[-1]
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
if self.is_local:
|
||||
# add a penalty on the squared pre-activation values of the action
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += \
|
||||
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
|
||||
self.output = [self.policy_mean]
|
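# A small numpy sketch (not part of the original file) of how the default tanh activation together with
# output_scale bounds the actions: the dense layer output is squashed to (-1, 1) and then multiplied by
# the action space's max_abs_range, so the final action lies inside the allowed range.
import numpy as np

def bounded_action_sketch(pre_activation, output_scale):
    return np.tanh(pre_activation) * output_scale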
||||
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
|
||||
|
||||
|
||||
class DNDQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
|
||||
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DNDQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dnd_q_values_head'
|
||||
self.DND_size = agent_parameters.algorithm.dnd_size
|
||||
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
|
||||
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
|
||||
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
|
||||
self.number_of_nn = agent_parameters.algorithm.number_of_knn
|
||||
self.ap = agent_parameters
|
||||
self.dnd_embeddings = [None] * self.num_actions
|
||||
self.dnd_values = [None] * self.num_actions
|
||||
self.dnd_indices = [None] * self.num_actions
|
||||
self.dnd_distances = [None] * self.num_actions
|
||||
if self.ap.memory.shared_memory:
|
||||
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
|
||||
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
|
||||
else:
|
||||
self.DND = differentiable_neural_dictionary.QDND(
|
||||
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
|
||||
key_error_threshold=self.DND_key_error_threshold,
|
||||
learning_rate=self.network_parameters.learning_rate,
|
||||
num_neighbors=self.number_of_nn,
|
||||
override_existing_keys=True)
|
||||
|
||||
# Retrieve info from DND dictionary
|
||||
# We assume that all actions have enough entries in the DND
|
||||
self.output = tf.transpose([
|
||||
self._q_value(input_layer, action)
|
||||
for action in range(self.num_actions)
|
||||
])
|
||||
|
||||
def _q_value(self, input_layer, action):
|
||||
result = tf.py_func(self.DND.query,
|
||||
[input_layer, action, self.number_of_nn],
|
||||
[tf.float64, tf.float64, tf.int64])
|
||||
self.dnd_embeddings[action] = tf.to_float(result[0])
|
||||
self.dnd_values[action] = tf.to_float(result[1])
|
||||
self.dnd_indices[action] = result[2]
|
||||
|
||||
# DND calculation
|
||||
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
|
||||
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
|
||||
self.dnd_distances[action] = distances
|
||||
weights = 1.0 / distances
|
||||
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
|
||||
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
|
||||
q_value.set_shape((None,))
|
||||
return q_value
|
||||
|
||||
def _post_build(self):
|
||||
# DND gradients
|
||||
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
|
||||
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)
|
||||
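For reference, the inverse-distance kernel used in _q_value above can be sketched in NumPy as follows (the array names are illustrative stand-ins for the DND query results):

import numpy as np

def dnd_q_estimate(query_embedding, neighbor_keys, neighbor_values, delta=1e-3):
    # query_embedding: (d,), neighbor_keys: (k, d), neighbor_values: (k,)
    distances = np.sum(np.square(neighbor_keys - query_embedding), axis=-1) + delta
    weights = 1.0 / distances                 # closer neighbors get larger weights
    weights /= weights.sum()                  # normalize the kernel weights
    return np.sum(neighbor_values * weights)  # kernel-weighted Q estimate

keys = np.array([[0.0, 0.0], [1.0, 1.0], [2.0, 0.0]])
values = np.array([1.0, 5.0, -2.0])
print(dnd_q_estimate(np.array([0.1, 0.0]), keys, values))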
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
|
||||
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
|
||||
class DuelingQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
|
||||
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class DuelingQHead(QHead):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'dueling_q_values_head'
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# state value tower - V
|
||||
with tf.variable_scope("state_value"):
|
||||
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
state_value = tf.layers.dense(state_value, 1, name='fc2')
|
||||
# state_value = tf.expand_dims(state_value, axis=-1)
|
||||
|
||||
# action advantage tower - A
|
||||
with tf.variable_scope("action_advantage"):
|
||||
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
|
||||
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
|
||||
action_advantage = action_advantage - tf.reduce_mean(action_advantage)
|
||||
|
||||
# merge to state-action value function Q
|
||||
self.output = tf.add(state_value, action_advantage, name='output')
|
||||
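The dueling merge computes Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)). Note that the standard formulation subtracts the per-sample mean over the action axis, while the reduce_mean above averages over the whole batch as well; the two coincide only for a batch of size one. A NumPy sketch of the per-sample version:

import numpy as np

def dueling_q(state_value, advantages):
    # state_value: (batch, 1), advantages: (batch, num_actions)
    # subtracting the mean advantage keeps V and A identifiable
    return state_value + (advantages - advantages.mean(axis=1, keepdims=True))

print(dueling_q(np.array([[2.0]]), np.array([[1.0, -1.0, 0.0]])))  # [[3. 1. 2.]]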
165
rl_coach/architectures/tensorflow_components/heads/head.py
Normal file
@@ -0,0 +1,165 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Type
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters, Parameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from tensorflow.python.ops.losses.losses_impl import Reduction
|
||||
|
||||
from rl_coach.utils import force_list
|
||||
|
||||
|
||||
# Used to initialize weights for policy and value output layers
|
||||
def normalized_columns_initializer(std=1.0):
|
||||
def _initializer(shape, dtype=None, partition_info=None):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
return _initializer
|
||||
|
||||
|
||||
class HeadParameters(Parameters):
|
||||
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
|
||||
super().__init__()
|
||||
self.activation_function = activation_function
|
||||
self.name = name
|
||||
self.parameterized_class_name = parameterized_class.__name__
|
||||
|
||||
|
||||
class Head(object):
|
||||
"""
|
||||
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
|
||||
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
|
||||
an assigned loss function. The heads are algorithm dependent.
|
||||
"""
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
|
||||
self.head_idx = head_idx
|
||||
self.network_name = network_name
|
||||
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
|
||||
self.name = "head"
|
||||
self.output = []
|
||||
self.loss = []
|
||||
self.loss_type = []
|
||||
self.regularizations = []
|
||||
self.loss_weight = force_list(loss_weight)
|
||||
self.target = []
|
||||
self.importance_weight = []
|
||||
self.input = []
|
||||
self.is_local = is_local
|
||||
self.ap = agent_parameters
|
||||
self.spaces = spaces
|
||||
self.return_type = None
|
||||
self.activation_function = activation_function
|
||||
|
||||
def __call__(self, input_layer):
|
||||
"""
|
||||
Wrapper for building the module graph including scoping and loss creation
|
||||
:param input_layer: the input to the graph
|
||||
:return: the output of the last layer and the target placeholder
|
||||
"""
|
||||
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
|
||||
self._build_module(input_layer)
|
||||
|
||||
self.output = force_list(self.output)
|
||||
self.target = force_list(self.target)
|
||||
self.input = force_list(self.input)
|
||||
self.loss_type = force_list(self.loss_type)
|
||||
self.loss = force_list(self.loss)
|
||||
self.regularizations = force_list(self.regularizations)
|
||||
if self.is_local:
|
||||
self.set_loss()
|
||||
self._post_build()
|
||||
|
||||
if self.is_local:
|
||||
return self.output, self.target, self.input, self.importance_weight
|
||||
else:
|
||||
return self.output, self.input
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
"""
|
||||
Builds the graph of the module
|
||||
This method is called early on from __call__. It is expected to store the graph
|
||||
in self.output.
|
||||
:param input_layer: the input to the graph
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def _post_build(self):
|
||||
"""
|
||||
Optional function that allows adding any extra definitions after the head has been fully defined
|
||||
For example, this allows doing additional calculations that are based on the loss
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_name(self):
|
||||
"""
|
||||
Get a formatted name for the module
|
||||
:return: the formatted name
|
||||
"""
|
||||
return '{}_{}'.format(self.name, self.head_idx)
|
||||
|
||||
def set_loss(self):
|
||||
"""
|
||||
Creates a target placeholder, an importance weight placeholder and a weighted loss for each loss type,
|
||||
and appends the head's regularization terms to the loss list
:return: None
|
||||
"""
|
||||
|
||||
# there are heads that define the loss internally, but we need to create additional placeholders for them
|
||||
for idx in range(len(self.loss)):
|
||||
importance_weight = tf.placeholder('float',
|
||||
[None] + [1] * (len(self.target[idx].shape) - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# add losses and target placeholder
|
||||
for idx in range(len(self.loss_type)):
|
||||
# create target placeholder
|
||||
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
|
||||
self.target.append(target)
|
||||
|
||||
# create importance sampling weights placeholder
|
||||
num_target_dims = len(self.target[idx].shape)
|
||||
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
|
||||
'{}_importance_weight'.format(self.get_name()))
|
||||
self.importance_weight.append(importance_weight)
|
||||
|
||||
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
|
||||
# weights the specific loss of this head against other losses in this head or in other heads
|
||||
loss_weight = self.loss_weight[idx]*importance_weight
|
||||
loss = self.loss_type[idx](self.target[-1], self.output[idx],
|
||||
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
|
||||
|
||||
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
|
||||
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
|
||||
|
||||
# we add the loss to the losses collection and later we will extract it in general_network
|
||||
tf.losses.add_loss(loss)
|
||||
self.loss.append(loss)
|
||||
|
||||
# add regularizations
|
||||
for regularization in self.regularizations:
|
||||
self.loss.append(regularization)
|
||||
|
||||
@classmethod
|
||||
def path(cls):
|
||||
return cls.__name__  # cls is already the class, so use its name directly
|
||||
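A rough NumPy sketch of how set_loss combines the unreduced per-element loss with the importance-sampling weights and the head's loss weight (shapes and names are illustrative):

import numpy as np

def weighted_head_loss(per_element_loss, importance_weight, loss_weight=1.0):
    # per_element_loss: (batch, *target_dims) unreduced loss values
    # importance_weight: (batch, 1) per-sample weights (e.g. from prioritized replay)
    per_sample = per_element_loss.reshape(per_element_loss.shape[0], -1).sum(axis=1)
    return np.mean(loss_weight * importance_weight.reshape(-1) * per_sample)

loss = np.array([[0.5, 1.5], [2.0, 0.0]])
iw = np.array([[1.0], [0.5]])
print(weighted_head_loss(loss, iw))  # 1.5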
@@ -0,0 +1,65 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import Measurements
|
||||
|
||||
|
||||
class MeasurementsPredictionHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
|
||||
super().__init__(parameterized_class=MeasurementsPredictionHead,
|
||||
activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class MeasurementsPredictionHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'future_measurements_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_measurements = self.spaces.state['measurements'].shape[0]
|
||||
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
|
||||
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
|
||||
self.return_type = Measurements
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
|
||||
# actions expectation tower (expectation stream) - E
|
||||
with tf.variable_scope("expectation_stream"):
|
||||
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
|
||||
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
|
||||
|
||||
# action fine differences tower (action stream) - A
|
||||
with tf.variable_scope("action_stream"):
|
||||
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
|
||||
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
|
||||
name='output')
|
||||
action_stream = tf.reshape(action_stream,
|
||||
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
|
||||
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
|
||||
|
||||
# merge to future measurements predictions
|
||||
self.output = tf.add(expectation_stream, action_stream, name='output')
|
||||
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
|
||||
name="targets")
|
||||
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
|
||||
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
|
||||
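The expectation and action streams are merged by broadcasting, exactly as in the dueling architecture but over predicted measurements. An illustrative shape check (the dimensions are arbitrary):

import numpy as np

expectation_stream = np.zeros((4, 1, 6))          # (batch, 1, measurements * steps)
action_stream = np.random.randn(4, 3, 6)          # (batch, num_actions, measurements * steps)
action_stream -= action_stream.mean(axis=1, keepdims=True)
prediction = expectation_stream + action_stream   # broadcasts over the action axis
print(prediction.shape)                           # (4, 3, 6)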
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class NAFHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
|
||||
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class NAFHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True,activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
if not isinstance(self.spaces.action, BoxActionSpace):
|
||||
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
|
||||
|
||||
self.name = 'naf_q_values_head'
|
||||
self.num_actions = self.spaces.action.shape[0]
|
||||
self.output_scale = self.spaces.action.max_abs_range
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# NAF
|
||||
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
|
||||
self.input = self.action
|
||||
|
||||
# V Head
|
||||
self.V = tf.layers.dense(input_layer, 1, name='V')
|
||||
|
||||
# mu Head
|
||||
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
|
||||
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
|
||||
|
||||
# A Head
|
||||
# l_vector is a flat vector holding the entries of a lower-triangular matrix
|
||||
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
|
||||
|
||||
# Convert l to a lower triangular matrix and exponentiate its diagonal
|
||||
|
||||
i = 0
|
||||
columns = []
|
||||
for col in range(self.num_actions):
|
||||
start_row = col
|
||||
num_non_zero_elements = self.num_actions - start_row
|
||||
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
|
||||
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
|
||||
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
|
||||
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
|
||||
i += num_non_zero_elements
|
||||
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
|
||||
|
||||
# P = L*L^T
|
||||
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
|
||||
|
||||
# A = -1/2 * (u - mu)^T * P * (u - mu)
|
||||
action_diff = tf.expand_dims(self.action - self.mu, -1)
|
||||
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
|
||||
self.A = tf.reshape(a_matrix_form, [-1, 1])
|
||||
|
||||
# Q Head
|
||||
self.Q = tf.add(self.V, self.A, name='Q')
|
||||
|
||||
self.output = self.Q
|
||||
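The column-wise reconstruction of L and the quadratic advantage term above can be mirrored in NumPy as follows (a minimal sketch; l_vector is assumed to hold the lower-triangular entries column by column, with the diagonal stored as log values):

import numpy as np

def naf_advantage(l_vector, mu, action):
    n = mu.shape[0]
    L = np.zeros((n, n))
    i = 0
    for col in range(n):
        L[col, col] = np.exp(l_vector[i])                 # exponentiated diagonal keeps P positive definite
        L[col + 1:, col] = l_vector[i + 1:i + (n - col)]  # below-diagonal entries of this column
        i += n - col
    P = L @ L.T                                           # P = L * L^T
    diff = action - mu
    return -0.5 * diff @ P @ diff                         # A = -1/2 (u - mu)^T P (u - mu)

print(naf_advantage(np.array([0.0, 0.3, 0.1]), np.zeros(2), np.array([1.0, -1.0])))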
@@ -0,0 +1,151 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
|
||||
|
||||
class PolicyHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
|
||||
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PolicyHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'policy_values_head'
|
||||
self.return_type = ActionProbabilities
|
||||
self.beta = None
|
||||
self.action_penalty = None
|
||||
|
||||
self.exploration_policy = agent_parameters.exploration
|
||||
|
||||
# a scalar weight that penalizes low entropy values to encourage exploration
|
||||
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
|
||||
if hasattr(agent_parameters.algorithm, 'action_penalty'):
|
||||
self.action_penalty = agent_parameters.algorithm.action_penalty
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = []
|
||||
self.input = self.actions
|
||||
self.policy_distributions = []
|
||||
self.output = []
|
||||
|
||||
action_spaces = [self.spaces.action]
|
||||
if isinstance(self.spaces.action, CompoundActionSpace):
|
||||
action_spaces = self.spaces.action.sub_action_spaces
|
||||
|
||||
# create a compound action network
|
||||
for action_space_idx, action_space in enumerate(action_spaces):
|
||||
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
|
||||
if isinstance(action_space, DiscreteActionSpace):
|
||||
# create a discrete action network (softmax probabilities output)
|
||||
self._build_discrete_net(input_layer, action_space)
|
||||
elif isinstance(action_space, BoxActionSpace):
|
||||
# create a continuous action network (bounded mean and stdev outputs)
|
||||
self._build_continuous_net(input_layer, action_space)
|
||||
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
|
||||
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
|
||||
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate loss
|
||||
self.action_log_probs_wrt_policy = \
|
||||
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
|
||||
tf.losses.add_loss(self.loss_weight[0] * self.loss)
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
|
||||
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
|
||||
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
# (the + eps prevents a zero probability, which would make the log -inf later on)
|
||||
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
self.output.append(self.policy_probs)
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape
|
||||
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
|
||||
|
||||
# output activation function
|
||||
if np.all(self.spaces.action.max_abs_range < np.inf):
|
||||
# bounded actions
|
||||
self.output_scale = action_space.max_abs_range
|
||||
self.continuous_output_activation = self.activation_function
|
||||
else:
|
||||
# unbounded actions
|
||||
self.output_scale = 1
|
||||
self.continuous_output_activation = None
|
||||
|
||||
# mean
|
||||
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
|
||||
policy_values_mean = self.continuous_output_activation(pre_activation_policy_values_mean)
|
||||
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
|
||||
|
||||
self.output.append(self.policy_mean)
|
||||
|
||||
# standard deviation
|
||||
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
|
||||
# the stdev is an output of the network and uses a softplus activation as defined in A3C
|
||||
policy_values_std = tf.layers.dense(input_layer, num_actions,
|
||||
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
|
||||
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
|
||||
|
||||
self.output.append(self.policy_std)
|
||||
else:
|
||||
# the stdev is an externally given value
|
||||
# Warning: this variable must be placed explicitly in the local variables collection, since marking
|
||||
# it as non-trainable would otherwise put it in the global variables collection. Without this,
|
||||
# the variable is never initialized and runs with multiple workers get stuck.
|
||||
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
|
||||
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
|
||||
# assign op for the policy std
|
||||
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
|
||||
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
|
||||
self.policy_distributions.append(policy_distribution)
|
||||
|
||||
if self.is_local:
|
||||
# penalize the squared pre-activation values of the action mean layer
|
||||
if self.action_penalty and self.action_penalty != 0:
|
||||
self.regularizations += [
|
||||
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
|
||||
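The policy-gradient loss above is -mean(log pi(a|s) * advantage). A minimal NumPy sketch for the discrete case (the small epsilon plays the same role as the + eps added to the Categorical probabilities):

import numpy as np

def pg_loss(log_probs, advantages):
    # log_probs: log-probabilities of the taken actions, advantages: A(s, a)
    return -np.mean(log_probs * advantages)

probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
actions = np.array([0, 1])
advantages = np.array([1.5, -0.5])
log_probs = np.log(probs[np.arange(len(actions)), actions] + 1e-15)
print(pg_loss(log_probs, advantages))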
144
rl_coach/architectures/tensorflow_components/heads/ppo_head.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
from rl_coach.utils import eps
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
|
||||
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_head'
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
# used in regular PPO
|
||||
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
|
||||
if self.use_kl_regularization:
|
||||
# kl coefficient and its corresponding assignment operation and placeholder
|
||||
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
|
||||
trainable=False, name='kl_coefficient')
|
||||
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
|
||||
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
|
||||
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
|
||||
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
|
||||
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.beta = agent_parameters.algorithm.beta_entropy
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
if isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self._build_discrete_net(input_layer, self.spaces.action)
|
||||
elif isinstance(self.spaces.action, BoxActionSpace):
|
||||
self._build_continuous_net(input_layer, self.spaces.action)
|
||||
else:
|
||||
raise ValueError("only discrete or continuous action spaces are supported for PPO")
|
||||
|
||||
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
|
||||
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
|
||||
# Used by regular PPO only
|
||||
# add kl divergence regularization
|
||||
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
|
||||
|
||||
if self.use_kl_regularization:
|
||||
# no clipping => use kl regularization
|
||||
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
|
||||
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
|
||||
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
# calculate surrogate loss
|
||||
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
self.target = self.advantages
|
||||
# the ratio is computed as exp(log_prob - old_log_prob), so it is always strictly positive
|
||||
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
|
||||
if self.clip_likelihood_ratio_using_epsilon is not None:
|
||||
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
|
||||
self.input.append(self.clip_param_rescaler)
|
||||
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
|
||||
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
|
||||
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
|
||||
self.clipped_likelihood_ratio * self.advantages)
|
||||
else:
|
||||
self.scaled_advantages = self.likelihood_ratio * self.advantages
|
||||
# the minus sign turns maximizing the surrogate objective into a minimization problem for the optimizer
|
||||
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
|
||||
if self.is_local:
|
||||
# add entropy regularization
|
||||
if self.beta:
|
||||
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
|
||||
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
|
||||
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
|
||||
|
||||
self.loss = self.surrogate_loss
|
||||
tf.losses.add_loss(self.loss)
|
||||
|
||||
def _build_discrete_net(self, input_layer, action_space):
|
||||
num_actions = len(action_space.actions)
|
||||
self.actions = tf.placeholder(tf.int32, [None], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
# Policy Head
|
||||
self.input = [self.actions, self.old_policy_mean]
|
||||
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
|
||||
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
|
||||
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
|
||||
|
||||
self.output = self.policy_mean
|
||||
|
||||
def _build_continuous_net(self, input_layer, action_space):
|
||||
num_actions = action_space.shape[0]
|
||||
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
|
||||
|
||||
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
|
||||
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
|
||||
|
||||
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
|
||||
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
|
||||
kernel_initializer=normalized_columns_initializer(0.01))
|
||||
if self.is_local:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
|
||||
collections=[tf.GraphKeys.LOCAL_VARIABLES])
|
||||
else:
|
||||
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
|
||||
|
||||
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
|
||||
|
||||
# define the distributions for the policy and the old policy
|
||||
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
|
||||
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
|
||||
|
||||
self.output = [self.policy_mean, self.policy_std]
|
||||
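A compact NumPy sketch of the clipped surrogate loss built above (clip_eps stands in for clip_likelihood_ratio_using_epsilon times the rescaler):

import numpy as np

def clipped_surrogate_loss(log_probs, old_log_probs, advantages, clip_eps=0.2):
    ratio = np.exp(log_probs - old_log_probs)                 # likelihood ratio, always positive
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    # the minus sign turns maximization of the surrogate objective into a minimization
    return -np.mean(np.minimum(ratio * advantages, clipped * advantages))

print(clipped_surrogate_loss(np.array([-0.2, -1.0]), np.array([-0.4, -0.9]), np.array([1.0, -2.0])))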
@@ -0,0 +1,52 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.core_types import ActionProbabilities
|
||||
|
||||
|
||||
class PPOVHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
|
||||
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class PPOVHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'ppo_v_head'
|
||||
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
|
||||
self.return_type = ActionProbabilities
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
|
||||
self.input = [self.old_policy_value]
|
||||
self.output = tf.layers.dense(input_layer, 1, name='output',
|
||||
kernel_initializer=normalized_columns_initializer(1.0))
|
||||
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
|
||||
|
||||
value_loss_1 = tf.square(self.output - self.target)
|
||||
value_loss_2 = tf.square(self.old_policy_value +
|
||||
tf.clip_by_value(self.output - self.old_policy_value,
|
||||
-self.clip_likelihood_ratio_using_epsilon,
|
||||
self.clip_likelihood_ratio_using_epsilon) - self.target)
|
||||
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
|
||||
self.loss = self.vf_loss
|
||||
tf.losses.add_loss(self.loss)
|
||||
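The value loss above is the pessimistic (element-wise maximum) of the unclipped and clipped squared errors. A short NumPy equivalent for reference:

import numpy as np

def clipped_value_loss(values, old_values, returns, clip_eps=0.2):
    clipped_values = old_values + np.clip(values - old_values, -clip_eps, clip_eps)
    return np.mean(np.maximum((values - returns) ** 2, (clipped_values - returns) ** 2))

print(clipped_value_loss(np.array([1.5]), np.array([1.0]), np.array([2.0])))  # 0.64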
50
rl_coach/architectures/tensorflow_components/heads/q_head.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class QHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
|
||||
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class QHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'q_values_head'
|
||||
if isinstance(self.spaces.action, BoxActionSpace):
|
||||
self.num_actions = 1
|
||||
elif isinstance(self.spaces.action, DiscreteActionSpace):
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.return_type = QActionStateValue
|
||||
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# Standard Q Network
|
||||
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
|
||||
from rl_coach.core_types import QActionStateValue
|
||||
|
||||
|
||||
class QuantileRegressionQHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params'):
|
||||
super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
|
||||
name=name)
|
||||
|
||||
|
||||
class QuantileRegressionQHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'quantile_regression_dqn_head'
|
||||
self.num_actions = len(self.spaces.action.actions)
|
||||
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
|
||||
self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval # k
|
||||
self.return_type = QActionStateValue
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
|
||||
self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
|
||||
self.input = [self.actions, self.quantile_midpoints]
|
||||
|
||||
# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
|
||||
quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
|
||||
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
|
||||
self.output = quantiles_locations
|
||||
|
||||
self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
|
||||
self.target = self.quantiles
|
||||
|
||||
# only the quantiles of the taken action are taken into account
|
||||
quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)
|
||||
|
||||
# reorder the output quantiles and the target quantiles as a preparation step for calculating the loss
|
||||
# the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles)
|
||||
# the target quantiles vector is tiled as columns of an NxN matrix
|
||||
theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
|
||||
T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
|
||||
tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])
|
||||
|
||||
# Huber loss of T(theta_j) - theta_i
|
||||
error = T_theta_j - theta_i
|
||||
abs_error = tf.abs(error)
|
||||
quadratic = tf.minimum(abs_error, self.huber_loss_interval)
|
||||
huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2
|
||||
|
||||
# Quantile Huber loss
|
||||
quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss
|
||||
|
||||
# Quantile regression loss (the probability for each quantile is 1/num_quantiles)
|
||||
quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
|
||||
self.loss = quantile_regression_loss
|
||||
tf.losses.add_loss(self.loss)
|
||||
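The quantile Huber loss built above, written out in NumPy for a single sample (theta are the predicted quantile locations of the taken action, tau the quantile midpoints, k the Huber interval):

import numpy as np

def quantile_huber_loss(theta, target_quantiles, tau, k=1.0):
    error = target_quantiles[None, :] - theta[:, None]    # T(theta_j) - theta_i
    quadratic = np.minimum(np.abs(error), k)
    huber = k * (np.abs(error) - quadratic) + 0.5 * quadratic ** 2
    loss = np.abs(tau[:, None] - (error < 0)) * huber     # asymmetric quantile weighting
    return loss.sum() / len(theta)                        # probability 1/N per quantile

theta = np.array([0.0, 1.0])
targets = np.array([0.5, 1.5])
tau = np.array([0.25, 0.75])
print(quantile_huber_loss(theta, targets, tau))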
45
rl_coach/architectures/tensorflow_components/heads/v_head.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import AgentParameters
|
||||
from rl_coach.spaces import SpacesDefinition
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
|
||||
from rl_coach.core_types import VStateValue
|
||||
|
||||
|
||||
class VHeadParameters(HeadParameters):
|
||||
def __init__(self, activation_function: str ='relu', name: str='v_head_params'):
|
||||
super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)
|
||||
|
||||
|
||||
class VHead(Head):
|
||||
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
|
||||
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
|
||||
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
|
||||
self.name = 'v_values_head'
|
||||
self.return_type = VStateValue
|
||||
|
||||
if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
|
||||
self.loss_type = tf.losses.huber_loss
|
||||
else:
|
||||
self.loss_type = tf.losses.mean_squared_error
|
||||
|
||||
def _build_module(self, input_layer):
|
||||
# Standard V Network
|
||||
self.output = tf.layers.dense(input_layer, 1, name='output',
|
||||
kernel_initializer=normalized_columns_initializer(1.0))
|
||||
@@ -0,0 +1,86 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Union, List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
|
||||
from rl_coach.base_parameters import MiddlewareScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
|
||||
from rl_coach.core_types import Middleware_FC_Embedding
|
||||
|
||||
|
||||
class FCMiddlewareParameters(MiddlewareParameters):
|
||||
def __init__(self, activation_function='relu',
|
||||
scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
|
||||
batchnorm: bool = False, dropout: bool = False,
|
||||
name="middleware_fc_embedder"):
|
||||
super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
|
||||
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
|
||||
|
||||
|
||||
class FCMiddleware(Middleware):
|
||||
schemes = {
|
||||
MiddlewareScheme.Empty:
|
||||
[],
|
||||
|
||||
# ppo
|
||||
MiddlewareScheme.Shallow:
|
||||
[
|
||||
Dense([64])
|
||||
],
|
||||
|
||||
# dqn
|
||||
MiddlewareScheme.Medium:
|
||||
[
|
||||
Dense([512])
|
||||
],
|
||||
|
||||
MiddlewareScheme.Deep: \
|
||||
[
|
||||
Dense([128]),
|
||||
Dense([128]),
|
||||
Dense([128])
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, activation_function=tf.nn.relu,
|
||||
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
|
||||
batchnorm: bool = False, dropout: bool = False,
|
||||
name="middleware_fc_embedder"):
|
||||
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
|
||||
dropout=dropout, scheme=scheme, name=name)
|
||||
self.return_type = Middleware_FC_Embedding
|
||||
self.layers = []
|
||||
|
||||
def _build_module(self):
|
||||
self.layers.append(self.input)
|
||||
|
||||
if isinstance(self.scheme, MiddlewareScheme):
|
||||
layers_params = FCMiddleware.schemes[self.scheme]
|
||||
else:
|
||||
layers_params = self.scheme
|
||||
for idx, layer_params in enumerate(layers_params):
|
||||
self.layers.append(
|
||||
layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
|
||||
)
|
||||
|
||||
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
|
||||
self.activation_function, self.dropout,
|
||||
self.dropout_rate, idx))
|
||||
|
||||
self.output = self.layers[-1]
|
||||
|
||||
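A hypothetical usage sketch: a custom scheme can be passed as a plain list of layer descriptors, which _build_module applies in order instead of one of the MiddlewareScheme presets (the module paths follow the imports used above):

from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters

# two 256-unit dense layers instead of the single 512-unit Medium preset
custom_middleware = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256])])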
@@ -0,0 +1,113 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
|
||||
from rl_coach.base_parameters import MiddlewareScheme
|
||||
|
||||
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
|
||||
from rl_coach.core_types import Middleware_LSTM_Embedding
|
||||
|
||||
|
||||
class LSTMMiddlewareParameters(MiddlewareParameters):
|
||||
def __init__(self, activation_function='relu', number_of_lstm_cells=256,
|
||||
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
|
||||
batchnorm: bool = False, dropout: bool = False,
|
||||
name="middleware_lstm_embedder"):
|
||||
super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
|
||||
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
|
||||
self.number_of_lstm_cells = number_of_lstm_cells
|
||||
|
||||
|
||||
class LSTMMiddleware(Middleware):
|
||||
schemes = {
|
||||
MiddlewareScheme.Empty:
|
||||
[],
|
||||
|
||||
# ppo
|
||||
MiddlewareScheme.Shallow:
|
||||
[
|
||||
[64]
|
||||
],
|
||||
|
||||
# dqn
|
||||
MiddlewareScheme.Medium:
|
||||
[
|
||||
[512]
|
||||
],
|
||||
|
||||
MiddlewareScheme.Deep: \
|
||||
[
|
||||
[128],
|
||||
[128],
|
||||
[128]
|
||||
]
|
||||
}
|
||||
|
||||
def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
|
||||
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
|
||||
batchnorm: bool = False, dropout: bool = False,
|
||||
name="middleware_lstm_embedder"):
|
||||
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
|
||||
dropout=dropout, scheme=scheme, name=name)
|
||||
self.return_type = Middleware_LSTM_Embedding
|
||||
self.number_of_lstm_cells = number_of_lstm_cells
|
||||
self.layers = []
|
||||
|
||||
def _build_module(self):
|
||||
"""
|
||||
self.state_in: tuple of placeholders containing the initial state
|
||||
self.state_out: tuple of output state
|
||||
|
||||
TODO: the output shape appears to be (batch, features), yet the code below keeps only
|
||||
the first element of the batch for the output state, which may be wrong;
|
||||
the shapes need to be double-checked.
|
||||
"""
|
||||
|
||||
self.layers.append(self.input)
|
||||
|
||||
# optionally insert some dense layers before the LSTM
|
||||
if isinstance(self.scheme, MiddlewareScheme):
|
||||
layers_params = LSTMMiddleware.schemes[self.scheme]
|
||||
else:
|
||||
layers_params = self.scheme
|
||||
for idx, layer_params in enumerate(layers_params):
|
||||
self.layers.append(
|
||||
tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
|
||||
)
|
||||
|
||||
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
|
||||
self.activation_function, self.dropout,
|
||||
self.dropout_rate, idx))
|
||||
|
||||
# add the LSTM layer
|
||||
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
|
||||
self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
|
||||
self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
|
||||
self.state_init = [self.c_init, self.h_init]
|
||||
self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
|
||||
self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
|
||||
self.state_in = (self.c_in, self.h_in)
|
||||
rnn_in = tf.expand_dims(self.layers[-1], [0])
|
||||
step_size = tf.shape(self.layers[-1])[:1]
|
||||
state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
|
||||
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
|
||||
lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
|
||||
lstm_c, lstm_h = lstm_state
|
||||
self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
|
||||
self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
|
||||
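An illustrative pattern (assumed usage, not part of the diff) of how the recurrent state would be threaded between consecutive forward passes via state_init, (c_in, h_in) and state_out:

import numpy as np

def run_episode(step_fn, embeddings, number_of_lstm_cells=256):
    # start from zeros, mirroring state_init -> (c_in, h_in) -> state_out above
    state = (np.zeros((1, number_of_lstm_cells), np.float32),
             np.zeros((1, number_of_lstm_cells), np.float32))
    outputs = []
    for embedding in embeddings:
        output, state = step_fn(embedding, state)   # e.g. a session run of (output, state_out)
        outputs.append(output)
    return outputs

# toy stand-in for the TensorFlow step
step = lambda embedding, state: (embedding.sum(), state)
print(run_episode(step, [np.ones((1, 8)), np.zeros((1, 8))]))  # [8.0, 0.0]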
@@ -0,0 +1,68 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from typing import Type, Union, List
|
||||
|
||||
import tensorflow as tf
|
||||
from rl_coach.base_parameters import MiddlewareScheme, Parameters
|
||||
|
||||
from rl_coach.core_types import MiddlewareEmbedding
|
||||
|
||||
|
||||
class MiddlewareParameters(Parameters):
|
||||
def __init__(self, parameterized_class: Type['Middleware'],
|
||||
activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
|
||||
batchnorm: bool=False, dropout: bool=False,
|
||||
name='middleware'):
|
||||
super().__init__()
|
||||
self.activation_function = activation_function
|
||||
self.scheme = scheme
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
self.name = name
|
||||
self.parameterized_class_name = parameterized_class.__name__
|
||||
|
||||
|
||||
class Middleware(object):
|
||||
"""
|
||||
A middleware embedder is the middle part of the network. It takes the embeddings from the input
embedders, after they have been aggregated by some method (for example, concatenation), and passes
them through a neural network which is customizable but shared between the heads of the network.
|
||||
"""
|
||||
def __init__(self, activation_function=tf.nn.relu,
|
||||
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
|
||||
batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
|
||||
self.name = name
|
||||
self.input = None
|
||||
self.output = None
|
||||
self.activation_function = activation_function
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
self.dropout_rate = 0
|
||||
self.scheme = scheme
|
||||
self.return_type = MiddlewareEmbedding
|
||||
|
||||
def __call__(self, input_layer):
|
||||
with tf.variable_scope(self.get_name()):
|
||||
self.input = input_layer
|
||||
self._build_module()
|
||||
|
||||
return self.input, self.output
|
||||
|
||||
def _build_module(self):
|
||||
pass
|
||||
|
||||
def get_name(self):
|
||||
return self.name
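# A minimal subclass sketch of the Middleware base class above; the single dense layer and its
# size are illustrative assumptions rather than an existing Coach middleware.
#
# class TinyFCMiddleware(Middleware):
#     def _build_module(self):
#         # pass the aggregated embedding through one fully connected layer
#         self.output = tf.layers.dense(self.input, 64, activation=self.activation_function,
#                                       name='fc0')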
|
||||
121
rl_coach/architectures/tensorflow_components/shared_variables.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class SharedRunningStats(object):
|
||||
def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
|
||||
self.sess = None
|
||||
self.name = name
|
||||
self.replicated_device = replicated_device
|
||||
self.epsilon = epsilon
|
||||
self.ops_were_created = False
|
||||
if create_ops:
|
||||
with tf.device(replicated_device):
|
||||
self.create_ops()
|
||||
|
||||
def create_ops(self, shape=[1], clip_values=None):
|
||||
self.clip_values = clip_values
|
||||
with tf.variable_scope(self.name):
|
||||
self._sum = tf.get_variable(
|
||||
dtype=tf.float64,
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
name="running_sum", trainable=False, shape=shape, validate_shape=False,
|
||||
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
|
||||
self._sum_squared = tf.get_variable(
|
||||
dtype=tf.float64,
|
||||
initializer=tf.constant_initializer(self.epsilon),
|
||||
name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
|
||||
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
|
||||
self._count = tf.get_variable(
|
||||
dtype=tf.float64,
|
||||
shape=(),
|
||||
initializer=tf.constant_initializer(self.epsilon),
|
||||
name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])
|
||||
|
||||
self._shape = None
|
||||
self._mean = tf.div(self._sum, self._count, name="mean")
|
||||
self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
|
||||
/ tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
|
||||
self.tf_mean = tf.cast(self._mean, 'float32')
|
||||
self.tf_std = tf.cast(self._std, 'float32')
|
||||
|
||||
self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
|
||||
self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
|
||||
self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
|
||||
|
||||
self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
|
||||
self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
|
||||
self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)
|
||||
|
||||
self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
|
||||
self.normalized_obs = (self.raw_obs - self._mean) / self._std
|
||||
if self.clip_values is not None:
|
||||
self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])
|
||||
|
||||
self.ops_were_created = True
|
||||
|
||||
def set_session(self, sess):
|
||||
self.sess = sess
|
||||
|
||||
def push(self, x):
|
||||
x = x.astype('float64')
|
||||
self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
|
||||
feed_dict={
|
||||
self.new_sum: x.sum(axis=0).ravel(),
|
||||
self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
|
||||
self.newcount: np.array(len(x), dtype='float64')
|
||||
})
|
||||
if self._shape is None:
|
||||
self._shape = x.shape
|
||||
|
||||
@property
|
||||
def n(self):
|
||||
return self.sess.run(self._count)
|
||||
|
||||
@property
|
||||
def mean(self):
|
||||
return self.sess.run(self._mean)
|
||||
|
||||
@property
|
||||
def var(self):
|
||||
return self.std ** 2
|
||||
|
||||
@property
|
||||
def std(self):
|
||||
return self.sess.run(self._std)
|
||||
|
||||
@property
|
||||
def shape(self):
|
||||
return self._shape
|
||||
|
||||
@shape.setter
|
||||
def shape(self, val):
|
||||
self._shape = val
|
||||
self.new_sum.set_shape(val)
|
||||
self.new_sum_squared.set_shape(val)
|
||||
self.tf_mean.set_shape(val)
|
||||
self.tf_std.set_shape(val)
|
||||
self._sum.set_shape(val)
|
||||
self._sum_squared.set_shape(val)
|
||||
|
||||
def normalize(self, batch):
|
||||
if self.clip_values is not None:
|
||||
return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
|
||||
else:
|
||||
return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
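# A minimal usage sketch for SharedRunningStats, assuming a local (non-distributed) session;
# shapes and values are illustrative only.
#
# stats = SharedRunningStats(name="obs_stats", create_ops=False)
# stats.create_ops(shape=[4])                     # track statistics over 4-dimensional observations
# stats.set_session(tf.Session())
# stats.sess.run(tf.global_variables_initializer())
# stats.push(np.random.randn(32, 4))              # accumulate a batch of 32 observations
# normalized = stats.normalize(np.random.randn(8, 4))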
|
||||
350
rl_coach/base_parameters.py
Normal file
@@ -0,0 +1,350 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
from collections import OrderedDict
|
||||
from enum import Enum
|
||||
from typing import Dict, List, Union
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentSteps, GradientClippingMethod
|
||||
from rl_coach.filters.filter import NoInputFilter
|
||||
|
||||
|
||||
class Frameworks(Enum):
|
||||
tensorflow = "TensorFlow"
|
||||
|
||||
|
||||
class EmbedderScheme(Enum):
|
||||
Empty = "Empty"
|
||||
Shallow = "Shallow"
|
||||
Medium = "Medium"
|
||||
Deep = "Deep"
|
||||
|
||||
|
||||
class MiddlewareScheme(Enum):
|
||||
Empty = "Empty"
|
||||
Shallow = "Shallow"
|
||||
Medium = "Medium"
|
||||
Deep = "Deep"
|
||||
|
||||
|
||||
class EmbeddingMergerType(Enum):
|
||||
Concat = 0
|
||||
Sum = 1
|
||||
#ConcatDepthWise = 2
|
||||
#Multiply = 3
|
||||
|
||||
|
||||
def iterable_to_items(obj):
|
||||
if isinstance(obj, (dict, OrderedDict, types.MappingProxyType)):
|
||||
items = obj.items()
|
||||
elif isinstance(obj, list):
|
||||
items = enumerate(obj)
|
||||
else:
|
||||
raise ValueError("The given object is not a dict or a list")
|
||||
return items
|
||||
|
||||
|
||||
def unfold_dict_or_list(obj: Union[Dict, List, OrderedDict]):
|
||||
"""
|
||||
Recursively unfolds all the parameters in dictionaries and lists
|
||||
:param obj: a dictionary or list to unfold
|
||||
:return: the unfolded parameters dictionary
|
||||
"""
|
||||
parameters = OrderedDict()
|
||||
items = iterable_to_items(obj)
|
||||
for k, v in items:
|
||||
if isinstance(v, (dict, list, OrderedDict)):
|
||||
if 'tensorflow.' not in str(v.__class__):
|
||||
parameters[k] = unfold_dict_or_list(v)
|
||||
elif 'tensorflow.' in str(v.__class__):
|
||||
parameters[k] = v
|
||||
elif hasattr(v, '__dict__'):
|
||||
sub_params = v.__dict__
|
||||
if '__objclass__' not in sub_params.keys():
|
||||
try:
|
||||
parameters[k] = unfold_dict_or_list(sub_params)
|
||||
except RecursionError:
|
||||
parameters[k] = sub_params
|
||||
parameters[k]['__class__'] = v.__class__.__name__
|
||||
else:
|
||||
# unfolding this type of object will result in infinite recursion
|
||||
parameters[k] = sub_params
|
||||
else:
|
||||
parameters[k] = v
|
||||
if not isinstance(obj, (OrderedDict, list)):
|
||||
parameters = OrderedDict(sorted(parameters.items()))
|
||||
return parameters
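# A small illustrative call (the values are arbitrary): nested dicts and lists are flattened
# recursively into OrderedDicts, while leaf values are kept as-is.
#
# unfold_dict_or_list({'a': [1, 2], 'b': {'c': 3}})
# -> OrderedDict([('a', OrderedDict([(0, 1), (1, 2)])), ('b', OrderedDict([('c', 3)]))])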
|
||||
|
||||
|
||||
class Parameters(object):
|
||||
def __setattr__(self, key, value):
|
||||
caller_name = sys._getframe(1).f_code.co_name
|
||||
|
||||
if caller_name != '__init__' and not hasattr(self, key):
|
||||
raise TypeError("Parameter '{}' does not exist in {}. Parameters are only to be defined in a constructor of"
|
||||
" a class inheriting from Parameters. In order to explicitly register a new parameter "
|
||||
"outside of a constructor use register_var().".
|
||||
format(key, self.__class__))
|
||||
object.__setattr__(self, key, value)
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
if hasattr(self, 'parameterized_class_name'):
|
||||
module_path = os.path.relpath(inspect.getfile(self.__class__), os.getcwd())[:-3] + '.py'
|
||||
|
||||
return ':'.join([module_path, self.parameterized_class_name])
|
||||
else:
|
||||
raise ValueError("The parameters class does not have an attached class it parameterizes. "
|
||||
"The self.parameterized_class_name should be set to the parameterized class.")
|
||||
|
||||
def register_var(self, key, value):
|
||||
if hasattr(self, key):
|
||||
raise TypeError("Cannot register an already existing parameter '{}'. ".format(key))
|
||||
object.__setattr__(self, key, value)
|
||||
|
||||
def __str__(self):
|
||||
result = "\"{}\" {}\n".format(self.__class__.__name__,
|
||||
json.dumps(unfold_dict_or_list(self.__dict__), indent=4, default=repr))
|
||||
return result
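# A minimal sketch of the attribute guard enforced by Parameters.__setattr__ (the class and
# attribute names below are illustrative only):
#
# class MyAlgorithmParameters(Parameters):
#     def __init__(self):
#         self.learning_rate = 0.001       # allowed: defined inside a constructor
#
# params = MyAlgorithmParameters()
# params.learning_rate = 0.01              # allowed: the attribute already exists
# params.register_var('lr', 0.01)          # explicit registration outside of a constructor
# params.undefined_param = 1               # raises TypeError: unknown parameter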
|
||||
|
||||
|
||||
class AlgorithmParameters(Parameters):
|
||||
def __init__(self):
|
||||
# Architecture parameters
|
||||
self.use_accumulated_reward_as_measurement = False
|
||||
|
||||
# Agent parameters
|
||||
self.num_consecutive_playing_steps = EnvironmentSteps(1)
|
||||
self.num_consecutive_training_steps = 1 # TODO: update this to TrainingSteps
|
||||
|
||||
self.heatup_using_network_decisions = False
|
||||
self.discount = 0.99
|
||||
self.apply_gradients_every_x_episodes = 5
|
||||
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
|
||||
self.rate_for_copying_weights_to_target = 1.0
|
||||
self.load_memory_from_file_path = None
|
||||
self.collect_new_data = True
|
||||
|
||||
# HRL / HER related params
|
||||
self.in_action_space = None
|
||||
|
||||
# distributed agents params
|
||||
self.share_statistics_between_workers = True
|
||||
|
||||
# intrinsic reward
|
||||
self.scale_external_reward_by_intrinsic_reward_value = False
|
||||
|
||||
|
||||
class PresetValidationParameters(Parameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
|
||||
# the form of different workers starting at different times, and getting different assignments of CPU
|
||||
# time from the OS.
|
||||
|
||||
# Testing parameters
|
||||
self.test = False
|
||||
self.min_reward_threshold = 0
|
||||
self.max_episodes_to_achieve_reward = 1
|
||||
self.num_workers = 1
|
||||
self.reward_test_level = None
|
||||
self.trace_test_levels = None
|
||||
self.trace_max_env_steps = 5000
|
||||
|
||||
|
||||
class NetworkParameters(Parameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.framework = Frameworks.tensorflow
|
||||
self.sess = None
|
||||
|
||||
# hardware parameters
|
||||
self.force_cpu = False
|
||||
|
||||
# distributed training options
|
||||
self.num_threads = 1
|
||||
self.synchronize_over_num_threads = 1
|
||||
self.distributed = False
|
||||
self.async_training = False
|
||||
self.shared_optimizer = True
|
||||
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
|
||||
|
||||
# regularization
|
||||
self.clip_gradients = None
|
||||
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
|
||||
self.kl_divergence_constraint = None
|
||||
self.l2_regularization = 0
|
||||
|
||||
# learning rate
|
||||
self.learning_rate = 0.00025
|
||||
self.learning_rate_decay_rate = 0
|
||||
self.learning_rate_decay_steps = 0
|
||||
|
||||
# structure
|
||||
self.input_embedders_parameters = []
|
||||
self.embedding_merger_type = EmbeddingMergerType.Concat
|
||||
self.middleware_parameters = None
|
||||
self.heads_parameters = []
|
||||
self.num_output_head_copies = 1
|
||||
self.loss_weights = []
|
||||
self.rescale_gradient_from_head_by_factor = [1]
|
||||
self.use_separate_networks_per_head = False
|
||||
self.optimizer_type = 'Adam'
|
||||
self.optimizer_epsilon = 0.0001
|
||||
self.adam_optimizer_beta1 = 0.9
|
||||
self.adam_optimizer_beta2 = 0.99
|
||||
self.rms_prop_optimizer_decay = 0.9
|
||||
self.batch_size = 32
|
||||
self.replace_mse_with_huber_loss = False
|
||||
self.create_target_network = False
|
||||
|
||||
# Framework support
|
||||
self.tensorflow_support = True
|
||||
|
||||
|
||||
class InputEmbedderParameters(Parameters):
|
||||
def __init__(self, activation_function: str='relu', scheme: Union[List, EmbedderScheme]=EmbedderScheme.Medium,
|
||||
batchnorm: bool=False, dropout=False, name: str='embedder', input_rescaling=None, input_offset=None,
|
||||
input_clipping=None):
|
||||
super().__init__()
|
||||
self.activation_function = activation_function
|
||||
self.scheme = scheme
|
||||
self.batchnorm = batchnorm
|
||||
self.dropout = dropout
|
||||
|
||||
if input_rescaling is None:
|
||||
input_rescaling = {'image': 255.0, 'vector': 1.0}
|
||||
if input_offset is None:
|
||||
input_offset = {'image': 0.0, 'vector': 0.0}
|
||||
|
||||
self.input_rescaling = input_rescaling
|
||||
self.input_offset = input_offset
|
||||
self.input_clipping = input_clipping
|
||||
self.name = name
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return {
|
||||
"image": 'image_embedder:ImageEmbedder',
|
||||
"vector": 'vector_embedder:VectorEmbedder'
|
||||
}
|
||||
|
||||
|
||||
class VisualizationParameters(Parameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Visualization parameters
|
||||
self.print_summary = True
|
||||
self.dump_csv = True
|
||||
self.dump_gifs = False
|
||||
self.dump_mp4 = False
|
||||
self.dump_signals_to_csv_every_x_episodes = 5
|
||||
self.dump_in_episode_signals = False
|
||||
self.dump_parameters_documentation = True
|
||||
self.render = False
|
||||
self.native_rendering = False
|
||||
self.max_fps_for_human_control = 10
|
||||
self.tensorboard = False
|
||||
self.video_dump_methods = [] # a list of dump methods which will be checked one after the other until the first
|
||||
# dump method that returns false for should_dump()
|
||||
self.add_rendered_image_to_env_response = False
|
||||
|
||||
|
||||
class AgentParameters(Parameters):
|
||||
def __init__(self, algorithm: AlgorithmParameters, exploration: 'ExplorationParameters', memory: 'MemoryParameters',
|
||||
networks: Dict[str, NetworkParameters], visualization: VisualizationParameters=VisualizationParameters()):
|
||||
"""
|
||||
:param algorithm: the algorithmic parameters
|
||||
:param exploration: the exploration policy parameters
|
||||
:param memory: the memory module parameters
|
||||
:param networks: the parameters for the networks of the agent
|
||||
:param visualization: the visualization parameters
|
||||
"""
|
||||
super().__init__()
|
||||
self.visualization = visualization
|
||||
self.algorithm = algorithm
|
||||
self.exploration = exploration
|
||||
self.memory = memory
|
||||
self.network_wrappers = networks
|
||||
self.input_filter = None
|
||||
self.output_filter = None
|
||||
self.pre_network_filter = NoInputFilter()
|
||||
self.full_name_id = None # TODO: do we really want to hold this parameters here?
|
||||
self.name = None
|
||||
self.is_a_highest_level_agent = True
|
||||
self.is_a_lowest_level_agent = True
|
||||
self.task_parameters = None
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.agents.agent:Agent'
|
||||
|
||||
|
||||
class TaskParameters(Parameters):
|
||||
def __init__(self, framework_type: str, evaluate_only: bool=False, use_cpu: bool=False, experiment_path=None,
|
||||
seed=None):
|
||||
"""
|
||||
:param framework_type: deep learning framework type. currently only tensorflow is supported
|
||||
:param evaluate_only: the task will be used only for evaluating the model
|
||||
:param use_cpu: use the cpu for this task
|
||||
:param experiment_path: the path to the directory which will store all the experiment outputs
|
||||
:param seed: a seed to use for the random numbers generator
|
||||
"""
|
||||
self.framework_type = framework_type
|
||||
self.task_index = None # TODO: not really needed
|
||||
self.evaluate_only = evaluate_only
|
||||
self.use_cpu = use_cpu
|
||||
self.experiment_path = experiment_path
|
||||
self.seed = seed
|
||||
|
||||
|
||||
class DistributedTaskParameters(TaskParameters):
|
||||
def __init__(self, framework_type: str, parameters_server_hosts: str, worker_hosts: str, job_type: str,
|
||||
task_index: int, evaluate_only: bool=False, num_tasks: int=None,
|
||||
num_training_tasks: int=None, use_cpu: bool=False, experiment_path=None, dnd=None,
|
||||
shared_memory_scratchpad=None, seed=None):
|
||||
"""
|
||||
:param framework_type: deep learning framework type. currently only tensorflow is supported
|
||||
:param evaluate_only: the task will be used only for evaluating the model
|
||||
:param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are
|
||||
assigned
|
||||
:param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned
|
||||
:param job_type: the job type - either ps (short for parameters server) or worker
|
||||
:param task_index: the index of the process
|
||||
:param num_tasks: the number of total tasks that are running (not including the parameters server)
|
||||
:param num_training_tasks: the number of tasks that are training (not including the parameters server)
|
||||
:param use_cpu: use the cpu for this task
|
||||
:param experiment_path: the path to the directory which will store all the experiment outputs
|
||||
:param dnd: an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.
|
||||
:param seed: a seed to use for the random numbers generator
|
||||
"""
|
||||
super().__init__(framework_type=framework_type, evaluate_only=evaluate_only, use_cpu=use_cpu,
|
||||
experiment_path=experiment_path, seed=seed)
|
||||
self.parameters_server_hosts = parameters_server_hosts
|
||||
self.worker_hosts = worker_hosts
|
||||
self.job_type = job_type
|
||||
self.task_index = task_index
|
||||
self.num_tasks = num_tasks
|
||||
self.num_training_tasks = num_training_tasks
|
||||
self.device = None # the replicated device which will be used for the global parameters
|
||||
self.worker_target = None
|
||||
self.dnd = dnd
|
||||
self.shared_memory_scratchpad = shared_memory_scratchpad
|
||||
402
rl_coach/coach.py
Normal file
@@ -0,0 +1,402 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import sys
|
||||
sys.path.append('.')
|
||||
|
||||
import copy
|
||||
from rl_coach.core_types import EnvironmentSteps
|
||||
import os
|
||||
from rl_coach import logger
|
||||
import traceback
|
||||
from rl_coach.logger import screen, failed_imports
|
||||
import argparse
|
||||
import atexit
|
||||
import time
|
||||
import sys
|
||||
from rl_coach.base_parameters import Frameworks, VisualizationParameters, TaskParameters, DistributedTaskParameters
|
||||
from multiprocessing import Process
|
||||
from multiprocessing.managers import BaseManager
|
||||
import subprocess
|
||||
from rl_coach.graph_managers.graph_manager import HumanPlayScheduleParameters, GraphManager
|
||||
from rl_coach.utils import list_all_presets, short_dynamic_import, get_open_port, SharedMemoryScratchPad, get_base_dir
|
||||
from rl_coach.agents.human_agent import HumanAgentParameters
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.environments.environment import SingleLevelSelection
|
||||
|
||||
|
||||
if len(set(failed_imports)) > 0:
|
||||
screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
|
||||
|
||||
|
||||
def get_graph_manager_from_args(args: argparse.Namespace) -> 'GraphManager':
|
||||
"""
|
||||
Return the graph manager according to the command line arguments given by the user
|
||||
:param args: the arguments given by the user
|
||||
:return: the updated graph manager
|
||||
"""
|
||||
|
||||
graph_manager = None
|
||||
|
||||
# if a preset was given we will load the graph manager for the preset
|
||||
if args.preset is not None:
|
||||
graph_manager = short_dynamic_import(args.preset, ignore_module_case=True)
|
||||
|
||||
# for human play we need to create a custom graph manager
|
||||
if args.play:
|
||||
env_params = short_dynamic_import(args.environment_type, ignore_module_case=True)()
|
||||
env_params.human_control = True
|
||||
schedule_params = HumanPlayScheduleParameters()
|
||||
graph_manager = BasicRLGraphManager(HumanAgentParameters(), env_params, schedule_params, VisualizationParameters())
|
||||
|
||||
if args.level:
|
||||
if isinstance(graph_manager.env_params.level, SingleLevelSelection):
|
||||
graph_manager.env_params.level.select(args.level)
|
||||
else:
|
||||
graph_manager.env_params.level = args.level
|
||||
|
||||
# set the seed for the environment
|
||||
if args.seed is not None:
|
||||
graph_manager.env_params.seed = args.seed
|
||||
|
||||
# visualization
|
||||
graph_manager.visualization_parameters.dump_gifs = graph_manager.visualization_parameters.dump_gifs or args.dump_gifs
|
||||
graph_manager.visualization_parameters.dump_mp4 = graph_manager.visualization_parameters.dump_mp4 or args.dump_mp4
|
||||
graph_manager.visualization_parameters.render = args.render
|
||||
graph_manager.visualization_parameters.tensorboard = args.tensorboard
|
||||
|
||||
# update the custom parameters
|
||||
if args.custom_parameter is not None:
|
||||
unstripped_key_value_pairs = [pair.split('=') for pair in args.custom_parameter.split(';')]
|
||||
stripped_key_value_pairs = [tuple([pair[0].strip(), pair[1].strip()]) for pair in
|
||||
unstripped_key_value_pairs if len(pair) == 2]
|
||||
|
||||
# load custom parameters into run_dict
|
||||
for key, value in stripped_key_value_pairs:
|
||||
exec("graph_manager.{}={}".format(key, value))
|
||||
|
||||
return graph_manager
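# For illustration, a --custom_parameter string such as
#     "visualization.render=False; num_training_iterations=500"
# (the format suggested by the -cp help text below) is split on ';' and each key=value pair is
# applied to the loaded graph manager via exec, effectively running
#     graph_manager.visualization.render=False
#     graph_manager.num_training_iterations=500
# Whether these attributes exist depends on the chosen preset.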
|
||||
|
||||
|
||||
def parse_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
|
||||
"""
|
||||
Parse the arguments that the user entered
|
||||
:param parser: the argparse command line parser
|
||||
:return: the parsed arguments
|
||||
"""
|
||||
args = parser.parse_args()
|
||||
|
||||
# if no arg is given
|
||||
if len(sys.argv) == 1:
|
||||
parser.print_help()
|
||||
exit(0)
|
||||
|
||||
# list available presets
|
||||
preset_names = list_all_presets()
|
||||
if args.list:
|
||||
screen.log_title("Available Presets:")
|
||||
for preset in sorted(preset_names):
|
||||
print(preset)
|
||||
sys.exit(0)
|
||||
|
||||
# replace a short preset name with the full path
|
||||
if args.preset is not None:
|
||||
if args.preset.lower() in [p.lower() for p in preset_names]:
|
||||
args.preset = "{}.py:graph_manager".format(os.path.join(get_base_dir(), 'presets', args.preset))
|
||||
else:
|
||||
args.preset = "{}".format(args.preset)
|
||||
|
||||
# verify that the preset exists
|
||||
preset_path = args.preset.split(":")[0]
|
||||
if not os.path.exists(preset_path):
|
||||
screen.error("The given preset ({}) cannot be found.".format(args.preset))
|
||||
|
||||
# verify that the preset can be instantiated
|
||||
try:
|
||||
short_dynamic_import(args.preset, ignore_module_case=True)
|
||||
except TypeError as e:
|
||||
traceback.print_exc()
|
||||
screen.error('Internal Error: ' + str(e) + "\n\nThe given preset ({}) cannot be instantiated."
|
||||
.format(args.preset))
|
||||
|
||||
# validate the checkpoints args
|
||||
if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
|
||||
screen.error("The requested checkpoint folder to load from does not exist.")
|
||||
|
||||
# no preset was given. check if the user requested to play some environment on its own
|
||||
if args.preset is None and args.play:
|
||||
if args.environment_type:
|
||||
args.agent_type = 'Human'
|
||||
else:
|
||||
screen.error('When no preset is given for Coach to run, and the user requests human control over '
|
||||
'the environment, the user is expected to input the desired environment_type and level.'
|
||||
'\nAt least one of these parameters was not given.')
|
||||
elif args.preset and args.play:
|
||||
screen.error("Both the --preset and the --play flags were set. These flags can not be used together. "
|
||||
"For human control, please use the --play flag together with the environment type flag (-et)")
|
||||
elif args.preset is None and not args.play:
|
||||
screen.error("Please choose a preset using the -p flag or use the --play flag together with choosing an "
|
||||
"environment type (-et) in order to play the game.")
|
||||
|
||||
# get experiment name and path
|
||||
args.experiment_name = logger.get_experiment_name(args.experiment_name)
|
||||
args.experiment_path = logger.get_experiment_path(args.experiment_name)
|
||||
|
||||
if args.play and args.num_workers > 1:
|
||||
screen.warning("Playing the game as a human is only available with a single worker. "
|
||||
"The number of workers will be reduced to 1")
|
||||
args.num_workers = 1
|
||||
|
||||
args.framework = Frameworks[args.framework.lower()]
|
||||
|
||||
# checkpoints
|
||||
args.save_checkpoint_dir = os.path.join(args.experiment_path, 'checkpoint') if args.save_checkpoint_secs is not None else None
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def add_items_to_dict(target_dict, source_dict):
|
||||
updated_task_parameters = copy.copy(source_dict)
|
||||
updated_task_parameters.update(target_dict)
|
||||
return updated_task_parameters
|
||||
|
||||
|
||||
def open_dashboard(experiment_path):
|
||||
dashboard_path = 'python {}/dashboard.py'.format(get_base_dir())
|
||||
cmd = "{} --experiment_dir {}".format(dashboard_path, experiment_path)
|
||||
screen.log_title("Opening dashboard - experiment path: {}".format(experiment_path))
|
||||
# subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True, executable="/bin/bash")
|
||||
subprocess.Popen(cmd, shell=True, executable="/bin/bash")
|
||||
|
||||
|
||||
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
|
||||
graph_manager.create_graph(task_parameters)
|
||||
|
||||
# let the adventure begin
|
||||
if task_parameters.evaluate_only:
|
||||
graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
|
||||
else:
|
||||
graph_manager.improve()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-p', '--preset',
|
||||
help="(string) Name of a preset to run (class name from the 'presets' directory.)",
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-l', '--list',
|
||||
help="(flag) List all available presets",
|
||||
action='store_true')
|
||||
parser.add_argument('-e', '--experiment_name',
|
||||
help="(string) Experiment name to be used to store the results.",
|
||||
default='',
|
||||
type=str)
|
||||
parser.add_argument('-r', '--render',
|
||||
help="(flag) Render environment",
|
||||
action='store_true')
|
||||
parser.add_argument('-f', '--framework',
|
||||
help="(string) Neural network framework. Available values: tensorflow",
|
||||
default='tensorflow',
|
||||
type=str)
|
||||
parser.add_argument('-n', '--num_workers',
|
||||
help="(int) Number of workers for multi-process based agents, e.g. A3C",
|
||||
default=1,
|
||||
type=int)
|
||||
parser.add_argument('-c', '--use_cpu',
|
||||
help="(flag) Use only the cpu for training. If a GPU is not available, this flag will have no "
|
||||
"effect and the CPU will be used either way.",
|
||||
action='store_true')
|
||||
parser.add_argument('-ew', '--evaluation_worker',
|
||||
help="(int) If multiple workers are used, add an evaluation worker as well which will "
|
||||
"evaluate asynchronously and independently during the training. NOTE: this worker will "
|
||||
"ignore the evaluation settings in the preset's ScheduleParams.",
|
||||
action='store_true')
|
||||
parser.add_argument('--play',
|
||||
help="(flag) Play as a human by controlling the game with the keyboard. "
|
||||
"This option will save a replay buffer with the game play.",
|
||||
action='store_true')
|
||||
parser.add_argument('--evaluate',
|
||||
help="(flag) Run evaluation only. This is a convenient way to disable "
|
||||
"training in order to evaluate an existing checkpoint.",
|
||||
action='store_true')
|
||||
parser.add_argument('-v', '--verbosity',
|
||||
help="(flag) Sets the verbosity level of Coach print outs. Can be either low or high.",
|
||||
default="low",
|
||||
type=str)
|
||||
parser.add_argument('-tfv', '--tf_verbosity',
|
||||
help="(flag) TensorFlow verbosity level",
|
||||
default=3,
|
||||
type=int)
|
||||
parser.add_argument('-s', '--save_checkpoint_secs',
|
||||
help="(int) Time in seconds between saving checkpoints of the model.",
|
||||
default=None,
|
||||
type=int)
|
||||
parser.add_argument('-crd', '--checkpoint_restore_dir',
|
||||
help='(string) Path to a folder containing a checkpoint to restore the model from.',
|
||||
type=str)
|
||||
parser.add_argument('-dg', '--dump_gifs',
|
||||
help="(flag) Enable the gif saving functionality.",
|
||||
action='store_true')
|
||||
parser.add_argument('-dm', '--dump_mp4',
|
||||
help="(flag) Enable the mp4 saving functionality.",
|
||||
action='store_true')
|
||||
parser.add_argument('-at', '--agent_type',
|
||||
help="(string) Choose an agent type class to override on top of the selected preset. "
|
||||
"If no preset is defined, a preset can be set from the command-line by combining settings "
|
||||
"which are set by using --agent_type, --experiment_type, --environemnt_type",
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-et', '--environment_type',
|
||||
help="(string) Choose an environment type class to override on top of the selected preset."
|
||||
"If no preset is defined, a preset can be set from the command-line by combining settings "
|
||||
"which are set by using --agent_type, --experiment_type, --environemnt_type",
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-ept', '--exploration_policy_type',
|
||||
help="(string) Choose an exploration policy type class to override on top of the selected "
|
||||
"preset."
|
||||
"If no preset is defined, a preset can be set from the command-line by combining settings "
|
||||
"which are set by using --agent_type, --experiment_type, --environemnt_type"
|
||||
,
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-lvl', '--level',
|
||||
help="(string) Choose the level that will be played in the environment that was selected."
|
||||
"This value will override the level parameter in the environment class."
|
||||
,
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-cp', '--custom_parameter',
|
||||
help="(string) Semicolon separated parameters used to override specific parameters on top of"
|
||||
" the selected preset (or on top of the command-line assembled one). "
|
||||
"Whenever a parameter value is a string, it should be inputted as '\\\"string\\\"'. "
|
||||
"For ex.: "
|
||||
"\"visualization.render=False; num_training_iterations=500; optimizer='rmsprop'\"",
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('--print_parameters',
|
||||
help="(flag) Print tuning_parameters to stdout",
|
||||
action='store_true')
|
||||
parser.add_argument('-tb', '--tensorboard',
|
||||
help="(flag) When using the TensorFlow backend, enable TensorBoard log dumps. ",
|
||||
action='store_true')
|
||||
parser.add_argument('-ns', '--no_summary',
|
||||
help="(flag) Prevent Coach from printing a summary and asking questions at the end of runs",
|
||||
action='store_true')
|
||||
parser.add_argument('-d', '--open_dashboard',
|
||||
help="(flag) Open dashboard with the experiment when the run starts",
|
||||
action='store_true')
|
||||
parser.add_argument('--seed',
|
||||
help="(int) A seed to use for running the experiment",
|
||||
default=None,
|
||||
type=int)
|
||||
|
||||
args = parse_arguments(parser)
|
||||
|
||||
graph_manager = get_graph_manager_from_args(args)
|
||||
|
||||
# Intel optimized TF seems to run significantly faster when limiting to a single OMP thread.
|
||||
# This will not affect GPU runs.
|
||||
os.environ["OMP_NUM_THREADS"] = "1"
|
||||
|
||||
# turn TF debug prints off
|
||||
if args.framework == Frameworks.tensorflow:
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_verbosity)
|
||||
|
||||
# turn off the summary at the end of the run if necessary
|
||||
if not args.no_summary:
|
||||
atexit.register(logger.summarize_experiment)
|
||||
screen.change_terminal_title(args.experiment_name)
|
||||
|
||||
# open dashboard
|
||||
if args.open_dashboard:
|
||||
open_dashboard(args.experiment_path)
|
||||
|
||||
# Single-threaded runs
|
||||
if args.num_workers == 1:
|
||||
# Start the training or evaluation
|
||||
task_parameters = TaskParameters(framework_type="tensorflow", # TODO: tensorflow shouldn't be hardcoded
|
||||
evaluate_only=args.evaluate,
|
||||
experiment_path=args.experiment_path,
|
||||
seed=args.seed,
|
||||
use_cpu=args.use_cpu)
|
||||
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
|
||||
|
||||
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
|
||||
|
||||
# Multi-threaded runs
|
||||
else:
|
||||
total_tasks = args.num_workers
|
||||
if args.evaluation_worker:
|
||||
total_tasks += 1
|
||||
|
||||
ps_hosts = "localhost:{}".format(get_open_port())
|
||||
worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(total_tasks)])
|
||||
|
||||
# Shared memory
|
||||
class CommManager(BaseManager):
|
||||
pass
|
||||
CommManager.register('SharedMemoryScratchPad', SharedMemoryScratchPad, exposed=['add', 'get', 'internal_call'])
|
||||
comm_manager = CommManager()
|
||||
comm_manager.start()
|
||||
shared_memory_scratchpad = comm_manager.SharedMemoryScratchPad()
|
||||
|
||||
def start_distributed_task(job_type, task_index, evaluation_worker=False,
|
||||
shared_memory_scratchpad=shared_memory_scratchpad):
|
||||
task_parameters = DistributedTaskParameters(framework_type="tensorflow", # TODO: tensorflow shouldn't be hardcoded
|
||||
parameters_server_hosts=ps_hosts,
|
||||
worker_hosts=worker_hosts,
|
||||
job_type=job_type,
|
||||
task_index=task_index,
|
||||
evaluate_only=evaluation_worker,
|
||||
use_cpu=args.use_cpu,
|
||||
num_tasks=total_tasks, # training tasks + 1 evaluation task
|
||||
num_training_tasks=args.num_workers,
|
||||
experiment_path=args.experiment_path,
|
||||
shared_memory_scratchpad=shared_memory_scratchpad,
|
||||
seed=args.seed+task_index if args.seed is not None else None) # each worker gets a different seed
|
||||
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
|
||||
# we assume that only the evaluation workers are rendering
|
||||
graph_manager.visualization_parameters.render = args.render and evaluation_worker
|
||||
p = Process(target=start_graph, args=(graph_manager, task_parameters))
|
||||
# p.daemon = True
|
||||
p.start()
|
||||
return p
|
||||
|
||||
# parameter server
|
||||
parameter_server = start_distributed_task("ps", 0)
|
||||
|
||||
# training workers
|
||||
# wait a bit before spawning the non-chief workers in order to make sure the session is already created
|
||||
workers = []
|
||||
workers.append(start_distributed_task("worker", 0))
|
||||
time.sleep(2)
|
||||
for task_index in range(1, args.num_workers):
|
||||
workers.append(start_distributed_task("worker", task_index))
|
||||
|
||||
# evaluation worker
|
||||
if args.evaluation_worker:
|
||||
evaluation_worker = start_distributed_task("worker", args.num_workers, evaluation_worker=True)
|
||||
|
||||
# wait for all workers
|
||||
[w.join() for w in workers]
|
||||
if args.evaluation_worker:
|
||||
evaluation_worker.terminate()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
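# Illustrative command lines (the preset and environment names are placeholders, not verified
# against this release):
#     python rl_coach/coach.py -l                                          # list available presets
#     python rl_coach/coach.py -p <PresetName> -n 4 -s 600                 # 4 workers, checkpoint every 10 minutes
#     python rl_coach/coach.py --play -et <environment_type> -lvl <level>  # human-controlled play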
|
||||
687
rl_coach/core_types.py
Normal file
@@ -0,0 +1,687 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Union, Dict, Any, Type
|
||||
from random import shuffle
|
||||
|
||||
import numpy as np
|
||||
import copy
|
||||
|
||||
ActionType = Union[int, float, np.ndarray, List]
|
||||
GoalType = Union[None, np.ndarray]
|
||||
ObservationType = np.ndarray
|
||||
RewardType = Union[int, float, np.ndarray]
|
||||
StateType = Dict[str, np.ndarray]
|
||||
|
||||
|
||||
class GoalTypes(Enum):
|
||||
Embedding = 1
|
||||
EmbeddingChange = 2
|
||||
Observation = 3
|
||||
Measurements = 4
|
||||
|
||||
|
||||
# step methods
|
||||
|
||||
class StepMethod(object):
|
||||
def __init__(self, num_steps: int):
|
||||
self._num_steps = self.num_steps = num_steps
|
||||
|
||||
@property
|
||||
def num_steps(self) -> int:
|
||||
return self._num_steps
|
||||
|
||||
@num_steps.setter
|
||||
def num_steps(self, val: int) -> None:
|
||||
self._num_steps = val
|
||||
|
||||
|
||||
class Frames(StepMethod):
|
||||
def __init__(self, num_steps):
|
||||
super().__init__(num_steps)
|
||||
|
||||
|
||||
class EnvironmentSteps(StepMethod):
|
||||
def __init__(self, num_steps):
|
||||
super().__init__(num_steps)
|
||||
|
||||
|
||||
class EnvironmentEpisodes(StepMethod):
|
||||
def __init__(self, num_steps):
|
||||
super().__init__(num_steps)
|
||||
|
||||
|
||||
class TrainingSteps(StepMethod):
|
||||
def __init__(self, num_steps):
|
||||
super().__init__(num_steps)
|
||||
|
||||
|
||||
class Time(StepMethod):
|
||||
def __init__(self, num_steps):
|
||||
super().__init__(num_steps)
|
||||
|
||||
|
||||
class PredictionType(object):
|
||||
pass
|
||||
|
||||
|
||||
class VStateValue(PredictionType):
|
||||
pass
|
||||
|
||||
|
||||
class QActionStateValue(PredictionType):
|
||||
pass
|
||||
|
||||
|
||||
class ActionProbabilities(PredictionType):
|
||||
pass
|
||||
|
||||
|
||||
class Embedding(PredictionType):
|
||||
pass
|
||||
|
||||
|
||||
class InputEmbedding(Embedding):
|
||||
pass
|
||||
|
||||
|
||||
class MiddlewareEmbedding(Embedding):
|
||||
pass
|
||||
|
||||
|
||||
class InputImageEmbedding(InputEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
class InputVectorEmbedding(InputEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
class Middleware_FC_Embedding(MiddlewareEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
class Middleware_LSTM_Embedding(MiddlewareEmbedding):
|
||||
pass
|
||||
|
||||
|
||||
class Measurements(PredictionType):
|
||||
pass
|
||||
|
||||
PlayingStepsType = Union[EnvironmentSteps, EnvironmentEpisodes, Frames]
|
||||
|
||||
|
||||
# run phases
|
||||
class RunPhase(Enum):
|
||||
HEATUP = "Heatup"
|
||||
TRAIN = "Training"
|
||||
TEST = "Testing"
|
||||
UNDEFINED = "Undefined"
|
||||
|
||||
|
||||
# transitions
|
||||
|
||||
class Transition(object):
|
||||
def __init__(self, state: Dict[str, np.ndarray]=None, action: ActionType=None, reward: RewardType=None,
|
||||
next_state: Dict[str, np.ndarray]=None, game_over: bool=None, info: Dict=None):
|
||||
"""
|
||||
A transition is a tuple containing the information of a single step of interaction
|
||||
between the agent and the environment. The most basic version should contain the following values:
|
||||
(current state, action, reward, next state, game over)
|
||||
For imitation learning algorithms, if the reward, next state or game over is not known,
|
||||
it is sufficient to store the current state and action taken by the expert.
|
||||
|
||||
:param state: The current state. Assumed to be a dictionary where the observation
|
||||
is located at state['observation']
|
||||
:param action: The current action that was taken
|
||||
:param reward: The reward received from the environment
|
||||
:param next_state: The next state of the environment after applying the action.
|
||||
The next state should be similar to the state in its structure.
|
||||
:param game_over: A boolean which should be True if the episode terminated after
|
||||
the execution of the action.
|
||||
:param info: A dictionary containing any additional information to be stored in the transition
|
||||
"""
|
||||
|
||||
self._state = self.state = state
|
||||
self._action = self.action = action
|
||||
self._reward = self.reward = reward
|
||||
self._total_return = self.total_return = None
|
||||
if not next_state:
|
||||
next_state = state
|
||||
self._next_state = self.next_state = next_state
|
||||
self._game_over = self.game_over = game_over
|
||||
if info is None:
|
||||
self.info = {}
|
||||
else:
|
||||
self.info = info
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.__dict__)
|
||||
|
||||
@property
|
||||
def state(self):
|
||||
if self._state is None:
|
||||
raise Exception("The state was not filled by any of the modules between the environment and the agent")
|
||||
return self._state
|
||||
|
||||
@state.setter
|
||||
def state(self, val):
|
||||
self._state = val
|
||||
|
||||
@property
|
||||
def action(self):
|
||||
if self._action is None:
|
||||
raise Exception("The action was not filled by any of the modules between the environment and the agent")
|
||||
return self._action
|
||||
|
||||
@action.setter
|
||||
def action(self, val):
|
||||
self._action = val
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
|
||||
if self._reward is None:
|
||||
raise Exception("The reward was not filled by any of the modules between the environment and the agent")
|
||||
return self._reward
|
||||
|
||||
@reward.setter
|
||||
def reward(self, val):
|
||||
self._reward = val
|
||||
|
||||
@property
|
||||
def total_return(self):
|
||||
if self._total_return is None:
|
||||
raise Exception("The total_return was not filled by any of the modules between the environment and the "
|
||||
"agent. Make sure that you are using an episodic experience replay.")
|
||||
return self._total_return
|
||||
|
||||
@total_return.setter
|
||||
def total_return(self, val):
|
||||
self._total_return = val
|
||||
|
||||
@property
|
||||
def game_over(self):
|
||||
if self._game_over is None:
|
||||
raise Exception("The done flag was not filled by any of the modules between the environment and the agent")
|
||||
return self._game_over
|
||||
|
||||
@game_over.setter
|
||||
def game_over(self, val):
|
||||
self._game_over = val
|
||||
|
||||
@property
|
||||
def next_state(self):
|
||||
if self._next_state is None:
|
||||
raise Exception("The next state was not filled by any of the modules between the environment and the agent")
|
||||
return self._next_state
|
||||
|
||||
@next_state.setter
|
||||
def next_state(self, val):
|
||||
self._next_state = val
|
||||
|
||||
def add_info(self, new_info: Dict[str, Any]) -> None:
|
||||
if not new_info.keys().isdisjoint(self.info.keys()):
|
||||
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there "
|
||||
"are overlapping keys between the two. old keys: {}, new keys: {}"
|
||||
.format(self.info.keys(), new_info.keys()))
|
||||
self.info.update(new_info)
|
||||
|
||||
def __copy__(self):
|
||||
new_transition = type(self)()
|
||||
new_transition.__dict__.update(self.__dict__)
|
||||
new_transition.state = copy.copy(new_transition.state)
|
||||
new_transition.next_state = copy.copy(new_transition.next_state)
|
||||
new_transition.info = copy.copy(new_transition.info)
|
||||
return new_transition
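# A minimal construction sketch for Transition (state contents and values are illustrative):
#
# t = Transition(state={'observation': np.zeros(4)}, action=1, reward=0.5,
#                next_state={'observation': np.ones(4)}, game_over=False)
# t.add_info({'action_probability': 0.25})
# print(t.reward, t.game_over)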
|
||||
|
||||
|
||||
class EnvResponse(object):
|
||||
def __init__(self, next_state: Dict[str, ObservationType], reward: RewardType, game_over: bool, info: Dict=None,
|
||||
goal: ObservationType=None):
|
||||
"""
|
||||
An env response is a collection containing the information returning from the environment after a single action
|
||||
has been performed on it.
|
||||
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
|
||||
observation is located at state['observation']
|
||||
:param reward: The reward received from the environment
|
||||
:param game_over: A boolean which should be True if the episode terminated after
|
||||
the execution of the action.
|
||||
:param info: any additional info from the environment
|
||||
:param goal: a goal defined by the environment
|
||||
"""
|
||||
self._next_state = self.next_state = next_state
|
||||
self._reward = self.reward = reward
|
||||
self._game_over = self.game_over = game_over
|
||||
self._goal = self.goal = goal
|
||||
if info is None:
|
||||
self.info = {}
|
||||
else:
|
||||
self.info = info
|
||||
|
||||
def __repr__(self):
|
||||
return str(self.__dict__)
|
||||
|
||||
@property
|
||||
def next_state(self):
|
||||
return self._next_state
|
||||
|
||||
@next_state.setter
|
||||
def next_state(self, val):
|
||||
self._next_state = val
|
||||
|
||||
@property
|
||||
def reward(self):
|
||||
return self._reward
|
||||
|
||||
@reward.setter
|
||||
def reward(self, val):
|
||||
self._reward = val
|
||||
|
||||
@property
|
||||
def game_over(self):
|
||||
return self._game_over
|
||||
|
||||
@game_over.setter
|
||||
def game_over(self, val):
|
||||
self._game_over = val
|
||||
|
||||
@property
|
||||
def goal(self):
|
||||
return self._goal
|
||||
|
||||
@goal.setter
|
||||
def goal(self, val):
|
||||
self._goal = val
|
||||
|
||||
def add_info(self, info: Dict[str, Any]) -> None:
|
||||
if not info.keys().isdisjoint(self.info.keys()):
|
||||
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there"
|
||||
"are overlapping keys between the two")
|
||||
self.info.update(info)
|
||||
|
||||
|
||||
class ActionInfo(object):
|
||||
"""
|
||||
Action info is a class that holds an action and various additional information details about it
|
||||
"""
|
||||
def __init__(self, action: ActionType, action_probability: float=0,
|
||||
action_value: float=0., state_value: float=0., max_action_value: float=None,
|
||||
action_intrinsic_reward: float=0):
|
||||
"""
|
||||
:param action: the action
|
||||
:param action_probability: the probability that the action was given when selecting it
|
||||
:param action_value: the state-action value (Q value) of the action
|
||||
:param state_value: the state value (V value) of the state where the action was taken
|
||||
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action
|
||||
that received the maximum value. if no value is given, the action is assumed to be the
|
||||
action with the maximum value
|
||||
:param action_intrinsic_reward: can contain any intrinsic reward that the agent wants to add to this action
|
||||
selection
|
||||
"""
|
||||
self.action = action
|
||||
self.action_probability = action_probability
|
||||
self.action_value = action_value
|
||||
self.state_value = state_value
|
||||
if max_action_value is None:
|
||||
self.max_action_value = action_value
|
||||
else:
|
||||
self.max_action_value = max_action_value
|
||||
self.action_intrinsic_reward = action_intrinsic_reward
|
||||
|
||||
|
||||
class Batch(object):
|
||||
def __init__(self, transitions: List[Transition]):
|
||||
"""
|
||||
A wrapper around a list of transitions that helps extracting batches of parameters from it.
|
||||
For example, one can extract a list of states corresponding to the list of transitions.
|
||||
The class uses lazy evaluation in order to return each of the available parameters.
|
||||
:param transitions: a list of transitions to extract the batch from
|
||||
"""
|
||||
self.transitions = transitions
|
||||
self._states = {}
|
||||
self._actions = None
|
||||
self._rewards = None
|
||||
self._total_returns = None
|
||||
self._game_overs = None
|
||||
self._next_states = {}
|
||||
self._goals = None
|
||||
self._info = {}
|
||||
|
||||
def slice(self, start, end) -> None:
|
||||
"""
|
||||
Keep a slice from the batch and discard the rest of the batch
|
||||
:param start: the start index in the slice
|
||||
:param end: the end index in the slice
|
||||
:return: None
|
||||
"""
|
||||
|
||||
self.transitions = self.transitions[start:end]
|
||||
for k, v in self._states.items():
|
||||
self._states[k] = v[start:end]
|
||||
if self._actions is not None:
|
||||
self._actions = self._actions[start:end]
|
||||
if self._rewards is not None:
|
||||
self._rewards = self._rewards[start:end]
|
||||
if self._total_returns is not None:
|
||||
self._total_returns = self._total_returns[start:end]
|
||||
if self._game_overs is not None:
|
||||
self._game_overs = self._game_overs[start:end]
|
||||
for k, v in self._next_states.items():
|
||||
self._next_states[k] = v[start:end]
|
||||
if self._goals is not None:
|
||||
self._goals = self._goals[start:end]
|
||||
for k, v in self._info.items():
|
||||
self._info[k] = v[start:end]
|
||||
|
||||
def shuffle(self) -> None:
|
||||
"""
|
||||
Shuffle all the transitions in the batch
|
||||
:return: None
|
||||
"""
|
||||
batch_order = list(range(self.size))
|
||||
shuffle(batch_order)
|
||||
self.transitions = [self.transitions[i] for i in batch_order]
|
||||
self._states = {}
|
||||
self._actions = None
|
||||
self._rewards = None
|
||||
self._total_returns = None
|
||||
self._game_overs = None
|
||||
self._next_states = {}
|
||||
self._goals = None
|
||||
self._info = {}
|
||||
|
||||
# This seems to be slower
|
||||
# for k, v in self._states.items():
|
||||
# self._states[k] = [v[i] for i in batch_order]
|
||||
# if self._actions is not None:
|
||||
# self._actions = [self._actions[i] for i in batch_order]
|
||||
# if self._rewards is not None:
|
||||
# self._rewards = [self._rewards[i] for i in batch_order]
|
||||
# if self._total_returns is not None:
|
||||
# self._total_returns = [self._total_returns[i] for i in batch_order]
|
||||
# if self._game_overs is not None:
|
||||
# self._game_overs = [self._game_overs[i] for i in batch_order]
|
||||
# for k, v in self._next_states.items():
|
||||
# self._next_states[k] = [v[i] for i in batch_order]
|
||||
# if self._goals is not None:
|
||||
# self._goals = [self._goals[i] for i in batch_order]
|
||||
# for k, v in self._info.items():
|
||||
# self._info[k] = [v[i] for i in batch_order]
|
||||
|
||||
def states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the states in the batch
|
||||
if these keys were not already extracted before. return only the values corresponding to those keys
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
"""
|
||||
current_states = {}
|
||||
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
|
||||
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
|
||||
# addition to the current_state, so that all the inputs of the network will be filled)
|
||||
for key in set(fetches).intersection(self.transitions[0].state.keys()):
|
||||
if key not in self._states.keys():
|
||||
self._states[key] = np.array([np.array(transition.state[key]) for transition in self.transitions])
|
||||
if expand_dims:
|
||||
current_states[key] = np.expand_dims(self._states[key], -1)
|
||||
else:
|
||||
current_states[key] = self._states[key]
|
||||
return current_states
|
||||
|
||||
def actions(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the actions were not converted to a batch before, extract them to a batch and then return the batch
|
||||
:param expand_dims: add an extra dimension to the actions batch
|
||||
:return: a numpy array containing all the actions of the batch
|
||||
"""
|
||||
if self._actions is None:
|
||||
self._actions = np.array([transition.action for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._actions, -1)
|
||||
return self._actions
|
||||
|
||||
def rewards(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
|
||||
:param expand_dims: add an extra dimension to the rewards batch
|
||||
:return: a numpy array containing all the rewards of the batch
|
||||
"""
|
||||
if self._rewards is None:
|
||||
self._rewards = np.array([transition.reward for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._rewards, -1)
|
||||
return self._rewards
|
||||
|
||||
def total_returns(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the total_returns were not converted to a batch before, extract them to a batch and then return the batch
|
||||
if the total return was not filled, this will raise an exception
|
||||
:param expand_dims: add an extra dimension to the total_returns batch
|
||||
:return: a numpy array containing all the total return values of the batch
|
||||
"""
|
||||
if self._total_returns is None:
|
||||
self._total_returns = np.array([transition.total_return for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._total_returns, -1)
|
||||
return self._total_returns
|
||||
|
||||
def game_overs(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
|
||||
:param expand_dims: add an extra dimension to the game_overs batch
|
||||
:return: a numpy array containing all the game over flags of the batch
|
||||
"""
|
||||
if self._game_overs is None:
|
||||
self._game_overs = np.array([transition.game_over for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._game_overs, -1)
|
||||
return self._game_overs
|
||||
|
||||
def next_states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
|
||||
"""
|
||||
follow the keys in fetches to extract the corresponding items from the next states in the batch
|
||||
if these keys were not already extracted before, and return only the values corresponding to those keys
|
||||
:param fetches: the keys of the state dictionary to extract
|
||||
:param expand_dims: add an extra dimension to each of the value batches
|
||||
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
|
||||
"""
|
||||
next_states = {}
|
||||
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
|
||||
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
|
||||
# addition to the current_state, so that all the inputs of the network will be filled)
|
||||
for key in set(fetches).intersection(self.transitions[0].next_state.keys()):
|
||||
if key not in self._next_states.keys():
|
||||
self._next_states[key] = np.array([np.array(transition.next_state[key]) for transition in self.transitions])
|
||||
if expand_dims:
|
||||
next_states[key] = np.expand_dims(self._next_states[key], -1)
|
||||
else:
|
||||
next_states[key] = self._next_states[key]
|
||||
return next_states
|
||||
|
||||
def goals(self, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the goals were not converted to a batch before, extract them to a batch and then return the batch
|
||||
if the goal was not filled, this will raise an exception
|
||||
:param expand_dims: add an extra dimension to the goals batch
|
||||
:return: a numpy array containing all the goals of the batch
|
||||
"""
|
||||
if self._goals is None:
|
||||
self._goals = np.array([transition.goal for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._goals, -1)
|
||||
return self._goals
|
||||
|
||||
def info(self, key, expand_dims=False) -> np.ndarray:
|
||||
"""
|
||||
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
|
||||
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
|
||||
:param key: the key in the info dictionary to extract
:param expand_dims: add an extra dimension to the info batch
|
||||
:return: a numpy array containing all the info values of the batch corresponding to the given key
|
||||
"""
|
||||
if key not in self._info.keys():
|
||||
self._info[key] = np.array([transition.info[key] for transition in self.transitions])
|
||||
if expand_dims:
|
||||
return np.expand_dims(self._info[key], -1)
|
||||
return self._info[key]
|
||||
|
||||
@property
|
||||
def size(self) -> int:
|
||||
"""
|
||||
:return: the size of the batch
|
||||
"""
|
||||
return len(self.transitions)
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
get an item from the transitions list
|
||||
:param key: index of the transition in the batch
|
||||
:return: the transition corresponding to the given index
|
||||
"""
|
||||
return self.transitions[key]
|
||||
|
||||
def __setitem__(self, key, item):
|
||||
"""
|
||||
set an item in the transition list
|
||||
:param key: index of the transition in the batch
|
||||
:param item: the transition to place in the given index
|
||||
:return: None
|
||||
"""
|
||||
self.transitions[key] = item
|
||||
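# Illustrative usage sketch (not part of the original file): given an instance of the
# batch class defined above, built from a list of transitions, a training step would
# typically pull the network inputs and targets along these lines. The 'observation'
# key is only an example of a state dictionary key.
#
#   actions = batch.actions()                                  # shape: (batch_size, ...)
#   rewards = batch.rewards(expand_dims=True)                  # shape: (batch_size, 1)
#   game_overs = batch.game_overs()
#   next_states = batch.next_states(fetches=['observation'])   # dict of numpy arrays
#
# Each accessor caches its result internally (e.g. self._actions), so repeated calls
# during the same training step do not rebuild the arrays.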
|
||||
|
||||
class TotalStepsCounter(object):
|
||||
"""
|
||||
A wrapper around a dictionary that counts the number of steps done for each StepMethod type.
|
||||
"""
|
||||
def __init__(self):
|
||||
self.counters = {
|
||||
EnvironmentEpisodes: 0,
|
||||
EnvironmentSteps: 0,
|
||||
TrainingSteps: 0
|
||||
}
|
||||
|
||||
def __getitem__(self, key: Type[StepMethod]) -> int:
|
||||
"""
|
||||
get counter value
|
||||
:param key: counter type
|
||||
:return: the counter value
|
||||
"""
|
||||
return self.counters[key]
|
||||
|
||||
def __setitem__(self, key: StepMethod, item: int) -> None:
|
||||
"""
|
||||
set the value of one of the step counters
|
||||
:param key: counter type
|
||||
:param item: an integer representing the new counter value
|
||||
:return: None
|
||||
"""
|
||||
self.counters[key] = item
|
||||
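# Illustrative usage (not from the original source):
#   counter = TotalStepsCounter()
#   counter[EnvironmentSteps] += 1      # increment after every environment step
#   counter[EnvironmentEpisodes] += 1   # increment after every finished episode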
|
||||
|
||||
class GradientClippingMethod(Enum):
|
||||
ClipByGlobalNorm = 0
|
||||
ClipByNorm = 1
|
||||
ClipByValue = 2
|
||||
|
||||
|
||||
class Episode(object):
|
||||
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
|
||||
"""
|
||||
:param discount: the discount factor to use when calculating total returns
|
||||
:param bootstrap_total_return_from_old_policy: should the total return be bootstrapped from the values in the
|
||||
memory
|
||||
:param n_step: the number of future steps to sum the reward over before bootstrapping
|
||||
"""
|
||||
self.transitions = []
|
||||
# a num_transitions x num_transitions table with the n step return in the n'th row
|
||||
self.returns_table = None
|
||||
self._length = 0
|
||||
self.discount = discount
|
||||
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
|
||||
self.n_step = n_step
|
||||
self.is_complete = False
|
||||
|
||||
def insert(self, transition):
|
||||
self.transitions.append(transition)
|
||||
self._length += 1
|
||||
|
||||
def is_empty(self):
|
||||
return self.length() == 0
|
||||
|
||||
def length(self):
|
||||
return self._length
|
||||
|
||||
def get_transition(self, transition_idx):
|
||||
return self.transitions[transition_idx]
|
||||
|
||||
def get_last_transition(self):
|
||||
return self.get_transition(-1) if self.length() > 0 else None
|
||||
|
||||
def get_first_transition(self):
|
||||
return self.get_transition(0) if self.length() > 0 else None
|
||||
|
||||
def update_returns(self):
|
||||
if self.n_step == -1 or self.n_step > self.length():
|
||||
self.n_step = self.length()
|
||||
rewards = np.array([t.reward for t in self.transitions])
|
||||
rewards = rewards.astype('float')
|
||||
total_return = rewards.copy()
|
||||
current_discount = self.discount
|
||||
for i in range(1, self.n_step):
|
||||
total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
|
||||
current_discount *= self.discount
|
||||
|
||||
# calculate the bootstrapped returns
|
||||
if self.bootstrap_total_return_from_old_policy:
|
||||
bootstraps = np.array([np.squeeze(t.info['max_action_value']) for t in self.transitions[self.n_step:]])
|
||||
bootstrapped_return = total_return + current_discount * np.pad(bootstraps, (0, self.n_step), 'constant',
|
||||
constant_values=0)
|
||||
total_return = bootstrapped_return
|
||||
|
||||
for transition_idx in range(self.length()):
|
||||
self.transitions[transition_idx].total_return = total_return[transition_idx]
|
||||
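# Worked example (illustrative, not part of the original file): with discount=0.9,
# rewards=[1, 1, 1] and n_step=2, the loop above computes
#   total_return = [1, 1, 1] + 0.9 * [1, 1, 0] = [1.9, 1.9, 1.0]
# i.e. each transition gets r_t + discount * r_{t+1}, with zero padding at the end of
# the episode. If bootstrapping is enabled, discount**n_step times the stored
# 'max_action_value' of the transition n_step ahead is then added on top of that.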
|
||||
def update_actions_probabilities(self):
|
||||
probability_product = 1
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
if 'action_probabilities' in transition.info.keys():
|
||||
probability_product *= transition.info['action_probabilities']
|
||||
for transition_idx, transition in enumerate(self.transitions):
|
||||
transition.info['probability_product'] = probability_product
|
||||
|
||||
def get_returns_table(self):
|
||||
return self.returns_table
|
||||
|
||||
def get_returns(self):
|
||||
return self.get_transitions_attribute('total_return')
|
||||
|
||||
def get_transitions_attribute(self, attribute_name):
|
||||
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
|
||||
return [getattr(t, attribute_name) for t in self.transitions]
|
||||
elif len(self.transitions) == 0:
|
||||
return []
|
||||
else:
|
||||
raise ValueError("The transitions have no such attribute name")
|
||||
|
||||
def to_batch(self):
|
||||
batch = []
|
||||
for i in range(self.length()):
|
||||
batch.append(self.get_transition(i))
|
||||
return batch
|
||||
73
rl_coach/dashboard.py
Normal file
@@ -0,0 +1,73 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""
|
||||
To run Coach Dashboard, run the following command:
|
||||
python3 dashboard.py
|
||||
"""
|
||||
|
||||
import sys
|
||||
sys.path.append('.')
|
||||
|
||||
import os
|
||||
|
||||
from rl_coach.dashboard_components.experiment_board import display_directory_group, display_files
|
||||
from rl_coach.dashboard_components.globals import doc
|
||||
import rl_coach.dashboard_components.boards
|
||||
from rl_coach.dashboard_components.landing_page import landing_page
|
||||
|
||||
doc.add_root(landing_page)
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-d', '--experiment_dir',
|
||||
help="(string) The path of an experiment dir to open",
|
||||
default=None,
|
||||
type=str)
|
||||
parser.add_argument('-f', '--experiment_files',
|
||||
help="(string) The path of an experiment file to open",
|
||||
default=None,
|
||||
type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.experiment_dir:
|
||||
doc.add_timeout_callback(lambda: display_directory_group(args.experiment_dir), 1000)
|
||||
elif args.experiment_files:
|
||||
files = []
|
||||
# args.experiment_files is a single glob pattern string, so expand it directly
files.extend(glob.glob(args.experiment_files))
|
||||
doc.add_timeout_callback(lambda: display_files(files), 1000)
|
||||
|
||||
|
||||
def main():
|
||||
from rl_coach.utils import get_open_port
|
||||
|
||||
dashboard_path = os.path.realpath(__file__)
|
||||
command = 'bokeh serve --show {} --port {}'.format(dashboard_path, get_open_port())
|
||||
if args.experiment_dir or args.experiment_files:
|
||||
command += ' --args'
|
||||
if args.experiment_dir:
|
||||
command += ' --experiment_dir {}'.format(args.experiment_dir)
|
||||
if args.experiment_files:
|
||||
command += ' --experiment_files {}'.format(args.experiment_files)
|
||||
|
||||
os.system(command)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
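# Example invocations (hypothetical paths, not from the original file):
#   python3 rl_coach/dashboard.py
#   python3 rl_coach/dashboard.py -d ./experiments/my_experiment
#   python3 rl_coach/dashboard.py -f './experiments/my_experiment/*.csv'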
0
rl_coach/dashboard_components/__init__.py
Normal file
21
rl_coach/dashboard_components/boards.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from bokeh.layouts import column
|
||||
from bokeh.models.widgets import Panel, Tabs
|
||||
from rl_coach.dashboard_components.experiment_board import experiment_board_layout
|
||||
from rl_coach.dashboard_components.episodic_board import episodic_board_layout
|
||||
from rl_coach.dashboard_components.globals import spinner, layouts
|
||||
from bokeh.models.widgets import Div
|
||||
|
||||
# ---------------- Build Website Layout -------------------
|
||||
|
||||
# title
|
||||
title = Div(text="""<h1>Coach Dashboard</h1>""")
|
||||
center = Div(text="""<style>html { padding-left: 50px; } </style>""")
|
||||
tab1 = Panel(child=experiment_board_layout, title='experiment board')
|
||||
# tab2 = Panel(child=episodic_board_layout, title='episodic board')
|
||||
# tabs = Tabs(tabs=[tab1, tab2])
|
||||
tabs = Tabs(tabs=[tab1])
|
||||
|
||||
layout = column(title, center, tabs)
|
||||
layout = column(layout, spinner)
|
||||
|
||||
layouts['boards'] = layout
|
||||
99
rl_coach/dashboard_components/episodic_board.py
Normal file
@@ -0,0 +1,99 @@
|
||||
|
||||
from bokeh.layouts import row, column, widgetbox, Spacer
|
||||
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend
|
||||
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup, Toggle
|
||||
from bokeh.plotting import figure
|
||||
from rl_coach.dashboard_components.globals import layouts, crcolor, crx, cry, color_resolution, crRGBs
|
||||
from rl_coach.dashboard_components.experiment_board import file_selection_button, files_selector_spacer, \
|
||||
group_selection_button, unload_file_button, files_selector
|
||||
|
||||
# ---------------- Build Website Layout -------------------
|
||||
|
||||
# file refresh time placeholder
|
||||
refresh_info = Div(text="""""", width=210)
|
||||
|
||||
# create figures
|
||||
plot = figure(plot_width=1200, plot_height=800,
|
||||
tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
|
||||
toolbar_location='above', x_axis_label='Episodes',
|
||||
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000))
|
||||
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
|
||||
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
|
||||
plot.yaxis[-1].visible = False
|
||||
|
||||
# legend
|
||||
div = Div(text="""""")
|
||||
legend = widgetbox([div])
|
||||
|
||||
bokeh_legend = Legend(
|
||||
# items=[("12345678901234567890123456789012345678901234567890", [])], # 50 letters
|
||||
items=[("__________________________________________________", [])], # 50 letters
|
||||
location=(0, 0), orientation="vertical",
|
||||
border_line_color="black",
|
||||
label_text_font_size={'value': '9pt'},
|
||||
margin=30
|
||||
)
|
||||
plot.add_layout(bokeh_legend, "right")
|
||||
|
||||
# select file
|
||||
file_selection_button = Button(label="Select Files", button_type="success", width=120)
|
||||
# file_selection_button.on_click(load_files_group)
|
||||
|
||||
files_selector_spacer = Spacer(width=10)
|
||||
|
||||
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
|
||||
# group_selection_button.on_click(load_directory_group)
|
||||
|
||||
unload_file_button = Button(label="Unload", button_type="danger", width=50)
|
||||
# unload_file_button.on_click(unload_file)
|
||||
|
||||
# files selection box
|
||||
files_selector = Select(title="Files:", options=[])
|
||||
# files_selector.on_change('value', change_data_selector)
|
||||
|
||||
# data selection box
|
||||
data_selector = MultiSelect(title="Data:", options=[], size=12)
|
||||
# data_selector.on_change('value', select_data)
|
||||
|
||||
# toggle second axis button
|
||||
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
|
||||
# toggle_second_axis_button.on_click(toggle_second_axis)
|
||||
|
||||
# averaging slider
|
||||
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10)
|
||||
# averaging_slider.on_change('value', update_averaging)
|
||||
|
||||
# color selector
|
||||
color_selector_title = Div(text="""Select Color:""")
|
||||
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
|
||||
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
|
||||
plot_width=300, plot_height=40,
|
||||
tools='tap')
|
||||
color_selector.axis.visible = False
|
||||
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
|
||||
color='crcolor', source=crsource)
|
||||
# crsource.on_change('selected', select_color)
|
||||
color_range.nonselection_glyph = color_range.glyph
|
||||
color_selector.toolbar.logo = None
|
||||
color_selector.toolbar_location = None
|
||||
|
||||
episode_selector = MultiSelect(title="Episode:", options=['0', '1', '2', '3', '4'], size=1)
|
||||
|
||||
online_toggle = Toggle(label="Online", button_type="success")
|
||||
|
||||
# main layout of the document
|
||||
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
|
||||
layout = column(layout, files_selector)
|
||||
layout = column(layout, row(refresh_info, unload_file_button))
|
||||
layout = column(layout, data_selector)
|
||||
layout = column(layout, color_selector_title)
|
||||
layout = column(layout, color_selector)
|
||||
layout = column(layout, toggle_second_axis_button)
|
||||
layout = column(layout, averaging_slider)
|
||||
layout = column(layout, episode_selector)
|
||||
layout = column(layout, online_toggle)
|
||||
layout = row(layout, plot)
|
||||
|
||||
episodic_board_layout = layout
|
||||
|
||||
layouts["episodic_board"] = episodic_board_layout
|
||||
564
rl_coach/dashboard_components/experiment_board.py
Normal file
@@ -0,0 +1,564 @@
|
||||
import copy
|
||||
import datetime
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from itertools import cycle
|
||||
from os import listdir
|
||||
from os.path import isfile, join, isdir
|
||||
|
||||
from bokeh.layouts import row, column, Spacer, ToolbarBox
|
||||
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend, \
|
||||
WheelZoomTool, CrosshairTool, ResetTool, SaveTool, Toolbar, PanTool, BoxZoomTool, \
|
||||
Toggle
|
||||
from bokeh.models.callbacks import CustomJS
|
||||
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup
|
||||
from bokeh.plotting import figure
|
||||
from rl_coach.dashboard_components.globals import signals_files, x_axis_labels, x_axis_options, show_spinner, hide_spinner, \
|
||||
dialog, FolderType, RunType, add_directory_csv_files, doc, display_boards, layouts, \
|
||||
crcolor, crx, cry, color_resolution, crRGBs, rgb_to_hex, x_axis
|
||||
from rl_coach.dashboard_components.signals_files_group import SignalsFilesGroup
|
||||
|
||||
from rl_coach.dashboard_components.signals_file import SignalsFile
|
||||
|
||||
|
||||
def update_axis_range(name, range_placeholder):
|
||||
max_val = -float('inf')
|
||||
min_val = float('inf')
|
||||
selected_signal = None
|
||||
if name in x_axis_options:
|
||||
selected_signal = name
|
||||
for signals_file in signals_files.values():
|
||||
curr_min_val, curr_max_val = signals_file.get_range_of_selected_signals_on_axis(name, selected_signal)
|
||||
max_val = max(max_val, curr_max_val)
|
||||
min_val = min(min_val, curr_min_val)
|
||||
if min_val != float('inf'):
|
||||
if min_val == max_val:
|
||||
range = 5
|
||||
else:
|
||||
range = max_val - min_val
|
||||
range_placeholder.start = min_val - 0.1 * range
|
||||
range_placeholder.end = max_val + 0.1 * range
|
||||
|
||||
|
||||
# update axes ranges
|
||||
def update_y_axis_ranges():
|
||||
update_axis_range('default', plot.y_range)
|
||||
update_axis_range('secondary', plot.extra_y_ranges['secondary'])
|
||||
|
||||
|
||||
def update_x_axis_ranges():
|
||||
update_axis_range(x_axis[0], plot.x_range)
|
||||
|
||||
|
||||
def get_all_selected_signals():
|
||||
signals = []
|
||||
for signals_file in signals_files.values():
|
||||
signals += signals_file.get_selected_signals()
|
||||
return signals
|
||||
|
||||
|
||||
# update legend using the legend text dictionary
|
||||
def update_legend():
|
||||
selected_signals = get_all_selected_signals()
|
||||
max_line_length = 50
|
||||
items = []
|
||||
for signal in selected_signals:
|
||||
side_sign = "◀" if signal.axis == 'default' else "▶"
|
||||
signal_name = side_sign + " " + signal.full_name
|
||||
# bokeh legend does not respect a max_width parameter so we split the text manually to lines of constant width
|
||||
signal_name = [signal_name[n:n + max_line_length] for n in range(0, len(signal_name), max_line_length)]
|
||||
for idx, substr in enumerate(signal_name):
|
||||
if idx == 0:
|
||||
lines = [signal.line]
|
||||
if signal.show_bollinger_bands:
|
||||
lines.append(signal.bands)
|
||||
items.append((substr, lines))
|
||||
else:
|
||||
items.append((substr, []))
|
||||
|
||||
if bokeh_legend.items == [] or items == [] or \
|
||||
any([legend_item.renderers != item[1] for legend_item, item in zip(bokeh_legend.items, items)])\
|
||||
or any([legend_item.label != item[0] for legend_item, item in zip(bokeh_legend.items, items)]):
|
||||
bokeh_legend.items = items # this step takes a long time because it is redrawing the plot
|
||||
|
||||
# the visible=false => visible=true is a hack to make the legend render again
|
||||
bokeh_legend.visible = False
|
||||
bokeh_legend.visible = True
|
||||
|
||||
|
||||
# select lines to display
|
||||
def select_data(args, old, new):
|
||||
if selected_file is None:
|
||||
return
|
||||
show_spinner("Updating the signal selection...")
|
||||
selected_signals = new
|
||||
for signal_name in selected_file.signals.keys():
|
||||
is_selected = signal_name in selected_signals
|
||||
selected_file.set_signal_selection(signal_name, is_selected)
|
||||
|
||||
# update axes ranges
|
||||
update_y_axis_ranges()
|
||||
update_x_axis_ranges()
|
||||
|
||||
# update the legend
|
||||
update_legend()
|
||||
|
||||
hide_spinner()
|
||||
|
||||
|
||||
# add new lines to the plot
|
||||
def plot_signals(signals_file, signals):
|
||||
for idx, signal in enumerate(signals):
|
||||
signal.line = plot.line('index', signal.name, source=signals_file.bokeh_source,
|
||||
line_color=signal.color, line_width=2)
|
||||
|
||||
|
||||
def open_file_dialog():
|
||||
return dialog.getFileDialog()
|
||||
|
||||
|
||||
def open_directory_dialog():
|
||||
return dialog.getDirDialog()
|
||||
|
||||
|
||||
# will create a group from the files
|
||||
def create_files_group_signal(files):
|
||||
global selected_file
|
||||
signals_file = SignalsFilesGroup(files, plot)
|
||||
|
||||
signals_files[signals_file.filename] = signals_file
|
||||
|
||||
filenames = [signals_file.filename]
|
||||
if files_selector.options[0] == "":
|
||||
files_selector.options = filenames
|
||||
else:
|
||||
files_selector.options = files_selector.options + filenames
|
||||
files_selector.value = filenames[0]
|
||||
selected_file = signals_file
|
||||
|
||||
|
||||
# load files from disk as a group
|
||||
def load_files_group():
|
||||
show_spinner("Loading files group...")
|
||||
files = open_file_dialog()
|
||||
# no files selected
|
||||
if not files or not files[0]:
|
||||
hide_spinner()
|
||||
return
|
||||
|
||||
display_boards()
|
||||
|
||||
if len(files) == 1:
|
||||
create_files_signal(files)
|
||||
else:
|
||||
create_files_group_signal(files)
|
||||
|
||||
change_selected_signals_in_data_selector([""])
|
||||
hide_spinner()
|
||||
|
||||
|
||||
# classify the folder as containing a single file, multiple files or only folders
|
||||
def classify_folder(dir_path):
|
||||
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')]
|
||||
folders = [d for d in listdir(dir_path) if isdir(join(dir_path, d)) and any(f.endswith(".csv") for f in os.listdir(join(dir_path, d)))]
|
||||
if len(files) == 1:
|
||||
return FolderType.SINGLE_FILE
|
||||
elif len(files) > 1:
|
||||
return FolderType.MULTIPLE_FILES
|
||||
elif len(folders) == 1:
|
||||
return classify_folder(join(dir_path, folders[0]))
|
||||
elif len(folders) > 1:
|
||||
return FolderType.MULTIPLE_FOLDERS
|
||||
else:
|
||||
return FolderType.EMPTY
|
||||
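# Illustrative examples (hypothetical layouts, not from the original source):
#   experiment/worker_0.csv                        -> FolderType.SINGLE_FILE
#   experiment/worker_0.csv + worker_1.csv         -> FolderType.MULTIPLE_FILES
#   experiment/run_1/*.csv + experiment/run_2/*.csv -> FolderType.MULTIPLE_FOLDERS
#   a directory with no csv files anywhere         -> FolderType.EMPTY
# A single sub-directory that itself contains csv files is classified recursively.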
|
||||
|
||||
# finds if this is single-threaded or multi-threaded
|
||||
def get_run_type(dir_path):
|
||||
folder_type = classify_folder(dir_path)
|
||||
if folder_type == FolderType.SINGLE_FILE:
|
||||
folder_type = RunType.SINGLE_FOLDER_SINGLE_FILE
|
||||
|
||||
elif folder_type == FolderType.MULTIPLE_FILES:
|
||||
folder_type = RunType.SINGLE_FOLDER_MULTIPLE_FILES
|
||||
|
||||
elif folder_type == FolderType.MULTIPLE_FOLDERS:
|
||||
# folder contains sub dirs -> we assume we can classify the folder using only the first sub dir
|
||||
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
|
||||
|
||||
# checking only the first folder in the root dir for its type, since we assume that all sub dirs will share the
|
||||
# same structure (i.e. if one is the result of a multi-threaded run, so will all the others).
|
||||
folder_type = classify_folder(os.path.join(dir_path, sub_dirs[0]))
|
||||
if folder_type == FolderType.SINGLE_FILE:
|
||||
folder_type = RunType.MULTIPLE_FOLDERS_SINGLE_FILES
|
||||
elif folder_type == FolderType.MULTIPLE_FILES:
|
||||
folder_type = RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES
|
||||
|
||||
return folder_type
|
||||
|
||||
|
||||
# create a signal file from the directory path according to the directory underlying structure
|
||||
def handle_dir(dir_path, run_type):
|
||||
paths = add_directory_csv_files(dir_path)
|
||||
if run_type in [RunType.SINGLE_FOLDER_MULTIPLE_FILES,
|
||||
RunType.MULTIPLE_FOLDERS_SINGLE_FILES]:
|
||||
create_files_group_signal(paths)
|
||||
elif run_type == RunType.SINGLE_FOLDER_SINGLE_FILE:
|
||||
create_files_signal(paths, use_dir_name=True)
|
||||
elif run_type == RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES:
|
||||
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
|
||||
create_files_group_signal([os.path.join(dir_path, d) for d in sub_dirs])
|
||||
|
||||
|
||||
# load directory from disk as a group
|
||||
def load_directory_group():
|
||||
show_spinner("Loading directories group...")
|
||||
directory = open_directory_dialog()
|
||||
# no files selected
|
||||
if not directory:
|
||||
hide_spinner()
|
||||
return
|
||||
|
||||
display_directory_group(directory)
|
||||
|
||||
|
||||
def display_directory_group(directory):
|
||||
pause_auto_update()
|
||||
|
||||
display_boards()
|
||||
show_spinner("Loading directories group...")
|
||||
|
||||
while get_run_type(directory) == FolderType.EMPTY:
|
||||
show_spinner("Waiting for experiment directory to get populated...")
|
||||
sys.stdout.write("Waiting for experiment directory to get populated...\r")
|
||||
time.sleep(10)
|
||||
|
||||
handle_dir(directory, get_run_type(directory))
|
||||
|
||||
change_selected_signals_in_data_selector([""])
|
||||
|
||||
resume_auto_update_according_to_toggle()
|
||||
hide_spinner()
|
||||
|
||||
|
||||
def create_files_signal(files, use_dir_name=False):
|
||||
global selected_file
|
||||
new_signal_files = []
|
||||
for idx, file_path in enumerate(files):
|
||||
signals_file = SignalsFile(str(file_path), plot=plot, use_dir_name=use_dir_name)
|
||||
signals_files[signals_file.filename] = signals_file
|
||||
new_signal_files.append(signals_file)
|
||||
|
||||
filenames = [f.filename for f in new_signal_files]
|
||||
|
||||
if files_selector.options[0] == "":
|
||||
files_selector.options = filenames
|
||||
else:
|
||||
files_selector.options = files_selector.options + filenames
|
||||
files_selector.value = filenames[0]
|
||||
selected_file = new_signal_files[0]
|
||||
|
||||
|
||||
# load files from disk
|
||||
def load_files():
|
||||
show_spinner("Loading files...")
|
||||
files = open_file_dialog()
|
||||
|
||||
# no files selected
|
||||
if not files or not files[0]:
|
||||
hide_spinner()
|
||||
return
|
||||
|
||||
display_files(files)
|
||||
|
||||
|
||||
def display_files(files):
|
||||
pause_auto_update()
|
||||
|
||||
display_boards()
|
||||
show_spinner("Loading files...")
|
||||
|
||||
create_files_signal(files)
|
||||
|
||||
change_selected_signals_in_data_selector([""])
|
||||
|
||||
resume_auto_update_according_to_toggle()
|
||||
hide_spinner()
|
||||
|
||||
|
||||
def unload_file():
|
||||
global selected_file
|
||||
if selected_file is None:
|
||||
return
|
||||
selected_file.hide_all_signals()
|
||||
del signals_files[selected_file.filename]
|
||||
data_selector.options = [""]
|
||||
filenames_list = copy.copy(files_selector.options)
|
||||
filenames_list.remove(selected_file.filename)
|
||||
if len(filenames_list) == 0:
|
||||
filenames_list = [""]
|
||||
files_selector.options = filenames_list
|
||||
filenames = cycle(filenames_list)
|
||||
if files_selector.options[0] != "":
|
||||
files_selector.value = next(filenames)
|
||||
else:
|
||||
files_selector.value = None
|
||||
|
||||
update_legend()
|
||||
refresh_info.text = ""
|
||||
if len(signals_files) == 0:
|
||||
selected_file = None
|
||||
|
||||
|
||||
# reload the selected csv file
|
||||
def reload_all_files(force=False):
|
||||
pause_auto_update()
|
||||
|
||||
for file_to_load in signals_files.values():
|
||||
if force or file_to_load.file_was_modified_on_disk():
|
||||
show_spinner("Updating files from the disk...")
|
||||
file_to_load.load()
|
||||
hide_spinner()
|
||||
refresh_info.text = "Last Update: " + str(datetime.datetime.now()).split(".")[0]
|
||||
|
||||
resume_auto_update_according_to_toggle()
|
||||
|
||||
|
||||
# unselect the currently selected signals and then select the requested signals in the data selector
|
||||
def change_selected_signals_in_data_selector(selected_signals):
|
||||
# the default bokeh way does not work due to a bug introduced in Bokeh 0.12.6 (https://github.com/bokeh/bokeh/issues/6501)
|
||||
# remove the data selection callback before updating the selector
|
||||
data_selector.remove_on_change('value', select_data)
|
||||
for value in list(data_selector.value):
|
||||
if value in data_selector.options:
|
||||
index = data_selector.options.index(value)
|
||||
data_selector.options.remove(value)
|
||||
data_selector.value.remove(value)
|
||||
data_selector.options.insert(index, value)
|
||||
data_selector.value = selected_signals
|
||||
# add back the data selection callback
|
||||
data_selector.on_change('value', select_data)
|
||||
|
||||
|
||||
# change data options according to the selected file
|
||||
def change_data_selector(args, old, new):
|
||||
global selected_file
|
||||
if new is None:
|
||||
selected_file = None
|
||||
return
|
||||
show_spinner("Updating selection...")
|
||||
selected_file = signals_files[new]
|
||||
if isinstance(selected_file, SignalsFile):
|
||||
group_cb.disabled = True
|
||||
elif isinstance(selected_file, SignalsFilesGroup):
|
||||
group_cb.disabled = False
|
||||
data_selector.remove_on_change('value', select_data)
|
||||
data_selector.options = sorted(list(selected_file.signals.keys()))
|
||||
data_selector.on_change('value', select_data)
|
||||
selected_signal_names = [s.name for s in selected_file.signals.values() if s.selected]
|
||||
if not selected_signal_names:
|
||||
selected_signal_names = [""]
|
||||
change_selected_signals_in_data_selector(selected_signal_names)
|
||||
averaging_slider.value = selected_file.signals_averaging_window
|
||||
if len(averaging_slider_dummy_source.data['value']) > 0:
|
||||
averaging_slider_dummy_source.data['value'][0] = selected_file.signals_averaging_window
|
||||
group_cb.active = [0] if selected_file.show_bollinger_bands else []
group_cb.active += [1] if selected_file.separate_files else []
|
||||
hide_spinner()
|
||||
|
||||
|
||||
# smooth all the signals of the selected file
|
||||
def update_averaging(args, old, new):
|
||||
show_spinner("Smoothing the signals...")
|
||||
# get the actual value from the dummy source
|
||||
new = averaging_slider_dummy_source.data['value'][0]
|
||||
selected_file.change_averaging_window(new)
|
||||
hide_spinner()
|
||||
|
||||
|
||||
def change_x_axis(val):
|
||||
global x_axis
|
||||
show_spinner("Updating the X axis...")
|
||||
x_axis[0] = x_axis_options[val]
|
||||
plot.xaxis.axis_label = x_axis_labels[val]
|
||||
|
||||
for file_to_load in signals_files.values():
|
||||
file_to_load.update_x_axis_index()
|
||||
# this is needed in order to recalculate the mean of all the files
|
||||
if isinstance(file_to_load, SignalsFilesGroup):
|
||||
file_to_load.load()
|
||||
|
||||
update_axis_range(x_axis[0], plot.x_range)
|
||||
hide_spinner()
|
||||
|
||||
|
||||
# move the signal between the main and secondary Y axes
|
||||
def toggle_second_axis():
|
||||
show_spinner("Switching the Y axis...")
|
||||
plot.yaxis[-1].visible = True
|
||||
selected_file.toggle_y_axis()
|
||||
|
||||
# this is just for redrawing the signals
|
||||
selected_file.reload_data()
|
||||
|
||||
update_y_axis_ranges()
|
||||
update_legend()
|
||||
|
||||
hide_spinner()
|
||||
|
||||
|
||||
def toggle_group_property(new):
|
||||
show_spinner("Loading...")
|
||||
|
||||
# toggle show / hide Bollinger bands
|
||||
selected_file.change_bollinger_bands_state(0 in new)
|
||||
|
||||
# show a separate signal for each file in a group
|
||||
selected_file.show_files_separately(1 in new)
|
||||
|
||||
update_legend()
|
||||
|
||||
hide_spinner()
|
||||
|
||||
|
||||
# Color selection - most of these functions are taken from bokeh examples (plotting/color_sliders.py)
|
||||
def select_color(attr, old, new):
|
||||
show_spinner("Changing signal color...")
|
||||
signals = selected_file.get_selected_signals()
|
||||
for signal in signals:
|
||||
signal.set_color(rgb_to_hex(crRGBs[new['1d']['indices'][0]]))
|
||||
hide_spinner()
|
||||
|
||||
|
||||
def pause_auto_update():
|
||||
toggle_auto_update(False)
|
||||
|
||||
|
||||
def resume_auto_update_according_to_toggle():
|
||||
toggle_auto_update(auto_update_toggle_button.active)
|
||||
|
||||
|
||||
def toggle_auto_update(new):
|
||||
global file_update_callback
|
||||
if new is False and file_update_callback in doc._session_callbacks:
|
||||
doc.remove_periodic_callback(file_update_callback)
|
||||
elif file_update_callback not in doc._session_callbacks:
|
||||
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
|
||||
|
||||
|
||||
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
|
||||
|
||||
# ---------------- Build Website Layout -------------------
|
||||
|
||||
# file refresh time placeholder
|
||||
refresh_info = Div(text="""""", width=210)
|
||||
|
||||
# create figures
|
||||
plot = figure(plot_width=1200, plot_height=800,
|
||||
# tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
|
||||
toolbar_location=None, x_axis_label='Episodes',
|
||||
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000), lod_factor=1000)
|
||||
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
|
||||
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
|
||||
toolbar = Toolbar(tools=[PanTool(), BoxZoomTool(), WheelZoomTool(), CrosshairTool(), ResetTool(), SaveTool()])
|
||||
# plot.toolbar = toolbar
|
||||
plot.add_tools(*toolbar.tools)
|
||||
plot.yaxis[-1].visible = False
|
||||
|
||||
bokeh_legend = Legend(
|
||||
items=[("", [])],
|
||||
orientation="vertical",
|
||||
border_line_color="black",
|
||||
label_text_font_size={'value': '9pt'},
|
||||
click_policy='hide',
|
||||
visible=False
|
||||
)
|
||||
bokeh_legend.label_width = 100
|
||||
plot.add_layout(bokeh_legend, "right")
|
||||
plot.y_range = Range1d(0, 100)
|
||||
plot.extra_y_ranges['secondary'] = Range1d(0, 100)
|
||||
|
||||
# select file
|
||||
file_selection_button = Button(label="Select Files", button_type="success", width=120)
|
||||
file_selection_button.on_click(load_files_group)
|
||||
|
||||
files_selector_spacer = Spacer(width=10)
|
||||
|
||||
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
|
||||
group_selection_button.on_click(load_directory_group)
|
||||
|
||||
update_files_button = Button(label="Update Files", button_type="default", width=50)
|
||||
update_files_button.on_click(reload_all_files)
|
||||
|
||||
auto_update_toggle_button = Toggle(label="Auto Update", button_type="default", width=50, active=True)
|
||||
auto_update_toggle_button.on_click(toggle_auto_update)
|
||||
|
||||
unload_file_button = Button(label="Unload", button_type="danger", width=50)
|
||||
unload_file_button.on_click(unload_file)
|
||||
|
||||
# files selection box
|
||||
files_selector = Select(title="Files:", options=[""])
|
||||
files_selector.on_change('value', change_data_selector)
|
||||
|
||||
# data selection box
|
||||
data_selector = MultiSelect(title="Data:", options=[], size=12)
|
||||
data_selector.on_change('value', select_data)
|
||||
|
||||
# x axis selection box
|
||||
x_axis_selector_title = Div(text="""X Axis:""", height=10)
|
||||
x_axis_selector = RadioButtonGroup(labels=x_axis_options, active=0)
|
||||
x_axis_selector.on_click(change_x_axis)
|
||||
|
||||
# toggle second axis button
|
||||
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
|
||||
toggle_second_axis_button.on_click(toggle_second_axis)
|
||||
|
||||
# averaging slider
|
||||
# This data source is just used to communicate / trigger the real callback
|
||||
averaging_slider_dummy_source = ColumnDataSource(data=dict(value=[]))
|
||||
averaging_slider_dummy_source.on_change('data', update_averaging)
|
||||
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10, callback_policy='mouseup')
|
||||
averaging_slider.callback = CustomJS(args=dict(source=averaging_slider_dummy_source), code="""
|
||||
source.data = { value: [cb_obj.value] }
|
||||
""")
|
||||
|
||||
# group properties checkbox
|
||||
group_cb = CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[])
|
||||
group_cb.on_click(toggle_group_property)
|
||||
|
||||
# color selector
|
||||
color_selector_title = Div(text="""Select Color:""")
|
||||
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
|
||||
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
|
||||
plot_width=300, plot_height=40,
|
||||
tools='tap')
|
||||
color_selector.axis.visible = False
|
||||
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
|
||||
color='crcolor', source=crsource)
|
||||
crsource.on_change('selected', select_color)
|
||||
color_range.nonselection_glyph = color_range.glyph
|
||||
color_selector.toolbar.logo = None
|
||||
color_selector.toolbar_location = None
|
||||
|
||||
# main layout of the document
|
||||
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
|
||||
layout = column(layout, files_selector)
|
||||
layout = column(layout, row(update_files_button, Spacer(width=50), auto_update_toggle_button,
|
||||
Spacer(width=50), unload_file_button))
|
||||
layout = column(layout, row(refresh_info))
|
||||
layout = column(layout, data_selector)
|
||||
layout = column(layout, color_selector_title)
|
||||
layout = column(layout, color_selector)
|
||||
layout = column(layout, x_axis_selector_title)
|
||||
layout = column(layout, x_axis_selector)
|
||||
layout = column(layout, group_cb)
|
||||
layout = column(layout, toggle_second_axis_button)
|
||||
layout = column(layout, averaging_slider)
|
||||
toolbox = ToolbarBox(toolbar=toolbar, toolbar_location='above')
|
||||
panel = column(toolbox, plot)
|
||||
layout = row(layout, panel)
|
||||
|
||||
experiment_board_layout = layout
|
||||
|
||||
layouts["experiment_board"] = experiment_board_layout
|
||||
136
rl_coach/dashboard_components/globals.py
Normal file
@@ -0,0 +1,136 @@
|
||||
import os
|
||||
from genericpath import isdir, isfile
|
||||
from os import listdir
|
||||
from os.path import join
|
||||
from enum import Enum
|
||||
from bokeh.models import Div
|
||||
from bokeh.plotting import curdoc
|
||||
import wx
|
||||
import colorsys
|
||||
|
||||
patches = {}
|
||||
signals_files = {}
|
||||
selected_file = None
|
||||
x_axis = ['Episode #']
|
||||
x_axis_options = ['Episode #', 'Total steps', 'Wall-Clock Time']
|
||||
x_axis_labels = ['Episode #', 'Total steps (per worker)', 'Wall-Clock Time (minutes)']
|
||||
current_color = 0
|
||||
|
||||
# spinner
|
||||
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
with open(os.path.join(root_dir, 'dashboard_components/spinner.css'), 'r') as f:
|
||||
spinner_style = """<style>{}</style>""".format(f.read())
|
||||
spinner_html = """<ul class="spinner"><li></li><li></li><li></li><li></li>
|
||||
<li>
|
||||
<br>
|
||||
<span style="font-size: 24px; font-weight: bold; margin-left: -175px; width: 400px;
|
||||
position: absolute; text-align: center;">
|
||||
{}
|
||||
</span>
|
||||
</li></ul>"""
|
||||
spinner = Div(text="""""")
|
||||
displayed_doc = "landing_page"
|
||||
layouts = {}
|
||||
|
||||
|
||||
def generate_color_range(N, I):
|
||||
HSV_tuples = [(x*1.0/N, 0.5, I) for x in range(N)]
|
||||
RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)
|
||||
for_conversion = []
|
||||
for RGB_tuple in RGB_tuples:
|
||||
for_conversion.append((int(RGB_tuple[0]*255), int(RGB_tuple[1]*255), int(RGB_tuple[2]*255)))
|
||||
hex_colors = [rgb_to_hex(RGB_tuple) for RGB_tuple in for_conversion]
|
||||
return hex_colors, for_conversion
|
||||
|
||||
|
||||
# convert RGB tuple to hexadecimal code
|
||||
def rgb_to_hex(rgb):
|
||||
return '#%02x%02x%02x' % rgb
|
||||
|
||||
|
||||
# convert hexadecimal to RGB tuple
|
||||
def hex_to_dec(hex):
|
||||
red = ''.join(hex.strip('#')[0:2])
|
||||
green = ''.join(hex.strip('#')[2:4])
|
||||
blue = ''.join(hex.strip('#')[4:6])
|
||||
return int(red, 16), int(green, 16), int(blue,16)
|
||||
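# Quick sanity check (illustrative): rgb_to_hex((255, 0, 0)) returns '#ff0000' and
# hex_to_dec('#ff0000') returns (255, 0, 0), so the two helpers are inverses of each other.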
|
||||
|
||||
color_resolution = 1000
|
||||
brightness = 0.75 # change to have brighter/darker colors
|
||||
crx = list(range(1, color_resolution+1)) # the resolution is 1000 colors
|
||||
cry = [5 for i in range(len(crx))]
|
||||
crcolor, crRGBs = generate_color_range(color_resolution, brightness) # produce spectrum
|
||||
|
||||
|
||||
def display_boards():
|
||||
global displayed_doc
|
||||
if displayed_doc == "landing_page":
|
||||
doc.remove_root(doc.roots[0])
|
||||
doc.add_root(layouts["boards"])
|
||||
displayed_doc = "boards"
|
||||
|
||||
|
||||
def show_spinner(text="Loading..."):
|
||||
spinner.text = spinner_style + spinner_html.format(text)
|
||||
|
||||
|
||||
def hide_spinner():
|
||||
spinner.text = ""
|
||||
|
||||
|
||||
# takes a path to a dir and recursively adds all of its csv files to paths
|
||||
def add_directory_csv_files(dir_path, paths=None):
|
||||
if not paths:
|
||||
paths = []
|
||||
|
||||
for p in listdir(dir_path):
|
||||
path = join(dir_path, p)
|
||||
if isdir(path):
|
||||
# call recursively for each dir
|
||||
paths = add_directory_csv_files(path, paths)
|
||||
elif isfile(path) and path.endswith('.csv'):
|
||||
# add every file to the list
|
||||
paths.append(path)
|
||||
|
||||
return paths
|
||||
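# Illustrative example (hypothetical layout, not from the original source): for a tree like
#   exp/run_1/worker_0.csv
#   exp/run_2/worker_0.csv
# add_directory_csv_files('exp') returns both csv paths, in listdir order.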
|
||||
|
||||
class DialogApp(wx.App):
|
||||
def getFileDialog(self):
|
||||
with wx.FileDialog(None, "Open CSV file", wildcard="CSV files (*.csv)|*.csv",
|
||||
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR | wx.FD_MULTIPLE) as fileDialog:
|
||||
if fileDialog.ShowModal() == wx.ID_CANCEL:
|
||||
return None # the user changed their mind
|
||||
else:
|
||||
# Proceed loading the file chosen by the user
|
||||
return fileDialog.GetPaths()
|
||||
|
||||
def getDirDialog(self):
|
||||
with wx.DirDialog(None, "Choose input directory", "",
|
||||
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR) as dirDialog:
|
||||
if dirDialog.ShowModal() == wx.ID_CANCEL:
|
||||
return None # the user changed their mind
|
||||
else:
|
||||
# Proceed loading the dir chosen by the user
|
||||
return dirDialog.GetPath()
|
||||
|
||||
|
||||
class RunType(Enum):
|
||||
SINGLE_FOLDER_SINGLE_FILE = 1
|
||||
SINGLE_FOLDER_MULTIPLE_FILES = 2
|
||||
MULTIPLE_FOLDERS_SINGLE_FILES = 3
|
||||
MULTIPLE_FOLDERS_MULTIPLE_FILES = 4
|
||||
UNKNOWN = 0
|
||||
|
||||
|
||||
class FolderType(Enum):
|
||||
SINGLE_FILE = 1
|
||||
MULTIPLE_FILES = 2
|
||||
MULTIPLE_FOLDERS = 3
|
||||
EMPTY = 4
|
||||
|
||||
|
||||
dialog = DialogApp()
|
||||
|
||||
doc = curdoc()
|
||||
22
rl_coach/dashboard_components/landing_page.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from bokeh.layouts import row, column
|
||||
from bokeh.models.widgets import Div
|
||||
|
||||
from rl_coach.dashboard_components.experiment_board import file_selection_button, group_selection_button
|
||||
from rl_coach.dashboard_components.globals import layouts
|
||||
|
||||
# title
|
||||
title = Div(text="""<h1>Coach Dashboard</h1>""")
|
||||
|
||||
# landing page
|
||||
landing_page_description = Div(text="""<h3>Start by selecting an experiment file or directory to open:</h3>""")
|
||||
center = Div(text="""<style>html { text-align: center; } </style>""")
|
||||
center_buttons = Div(text="""<style>.bk-root .bk-widget { margin: 0 auto; }</style>""", width=0)
|
||||
landing_page = column(center,
|
||||
title,
|
||||
landing_page_description,
|
||||
row(center_buttons),
|
||||
row(file_selection_button, sizing_mode='scale_width'),
|
||||
row(group_selection_button, sizing_mode='scale_width'),
|
||||
sizing_mode='scale_width')
|
||||
|
||||
layouts['landing_page'] = landing_page
|
||||
125
rl_coach/dashboard_components/signals.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from bokeh.models import ColumnDataSource
|
||||
from bokeh.palettes import Dark2
|
||||
from rl_coach.dashboard_components.globals import show_spinner, hide_spinner, current_color
|
||||
from rl_coach.utils import squeeze_list
|
||||
|
||||
|
||||
class Signal:
|
||||
def __init__(self, name, parent, plot):
|
||||
self.name = name
|
||||
self.full_name = "{}/{}".format(parent.filename, self.name)
|
||||
self.plot = plot
|
||||
self.selected = False
|
||||
self.color = random.choice(Dark2[8])
|
||||
self.line = None
|
||||
self.scatter = None
|
||||
self.bands = None
|
||||
self.bokeh_source = parent.bokeh_source
|
||||
self.min_val = 0
|
||||
self.max_val = 0
|
||||
self.axis = 'default'
|
||||
self.sub_signals = []
|
||||
for name in self.bokeh_source.data.keys():
|
||||
if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name:
|
||||
self.sub_signals.append(name)
|
||||
if len(self.sub_signals) > 1:
|
||||
self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]])
|
||||
self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]])
|
||||
self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]])
|
||||
self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]])
|
||||
else:
|
||||
self.mean_signal = squeeze_list(self.name)
|
||||
self.stdev_signal = None
|
||||
self.min_signal = None
|
||||
self.max_signal = None
|
||||
self.has_bollinger_bands = False
|
||||
if self.mean_signal and self.stdev_signal and self.min_signal and self.max_signal:
|
||||
self.has_bollinger_bands = True
|
||||
self.show_bollinger_bands = False
|
||||
self.bollinger_bands_source = None
|
||||
self.update_range()
|
||||
|
||||
def set_color(self, color):
|
||||
self.color = color
|
||||
if self.line:
|
||||
self.line.glyph.line_color = color
|
||||
if self.bands:
|
||||
self.bands.glyph.fill_color = color
|
||||
|
||||
def plot_line(self):
|
||||
global current_color
|
||||
self.set_color(Dark2[8][current_color])
|
||||
current_color = (current_color + 1) % len(Dark2[8])
|
||||
if self.has_bollinger_bands:
|
||||
self.set_bands_source()
|
||||
self.create_bands()
|
||||
self.line = self.plot.line('index', self.mean_signal, source=self.bokeh_source,
|
||||
line_color=self.color, line_width=2)
|
||||
# self.scatter = self.plot.scatter('index', self.mean_signal, source=self.bokeh_source)
|
||||
self.line.visible = True
|
||||
|
||||
def set_selected(self, val):
|
||||
if self.selected != val:
|
||||
self.selected = val
|
||||
if self.line:
|
||||
# self.set_color(Dark2[8][current_color])
|
||||
# current_color = (current_color + 1) % len(Dark2[8])
|
||||
self.line.visible = self.selected
|
||||
if self.bands:
|
||||
self.bands.visible = self.selected and self.show_bollinger_bands
|
||||
elif self.selected:
|
||||
# lazy plotting - plot only when selected for the first time
|
||||
self.plot_line()
|
||||
|
||||
def set_dash(self, dash):
|
||||
self.line.glyph.line_dash = dash
|
||||
|
||||
def create_bands(self):
|
||||
self.bands = self.plot.patch(x='band_x', y='band_y', source=self.bollinger_bands_source,
|
||||
color=self.color, fill_alpha=0.4, alpha=0.1, line_width=0)
|
||||
self.bands.visible = self.show_bollinger_bands
|
||||
# self.min_line = plot.line('index', self.min_signal, source=self.bokeh_source,
|
||||
# line_color=self.color, line_width=3, line_dash="4 4")
|
||||
# self.max_line = plot.line('index', self.max_signal, source=self.bokeh_source,
|
||||
# line_color=self.color, line_width=3, line_dash="4 4")
|
||||
# self.min_line.visible = self.show_bollinger_bands
|
||||
# self.max_line.visible = self.show_bollinger_bands
|
||||
|
||||
def set_bands_source(self):
|
||||
x_ticks = self.bokeh_source.data['index']
|
||||
mean_values = self.bokeh_source.data[self.mean_signal]
|
||||
stdev_values = self.bokeh_source.data[self.stdev_signal]
|
||||
band_x = np.append(x_ticks, x_ticks[::-1])
|
||||
band_y = np.append(mean_values - stdev_values, mean_values[::-1] + stdev_values[::-1])
|
||||
source_data = {'band_x': band_x, 'band_y': band_y}
|
||||
if self.bollinger_bands_source:
|
||||
self.bollinger_bands_source.data = source_data
|
||||
else:
|
||||
self.bollinger_bands_source = ColumnDataSource(source_data)
|
||||
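# Note (illustrative): band_x/band_y trace the band outline as a closed polygon, walking
# the x ticks forward along (mean - stdev) and then backward along (mean + stdev), which
# is the shape bokeh's patch glyph expects.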
|
||||
def change_bollinger_bands_state(self, new_state):
|
||||
self.show_bollinger_bands = new_state
|
||||
if self.bands and self.selected:
|
||||
self.bands.visible = new_state
|
||||
# self.min_line.visible = new_state
|
||||
# self.max_line.visible = new_state
|
||||
|
||||
def update_range(self):
|
||||
self.min_val = np.min(self.bokeh_source.data[self.mean_signal])
|
||||
self.max_val = np.max(self.bokeh_source.data[self.mean_signal])
|
||||
|
||||
def set_axis(self, axis):
|
||||
self.axis = axis
|
||||
if not self.line:
|
||||
self.plot_line()
|
||||
self.line.visible = False
|
||||
self.line.y_range_name = axis
|
||||
|
||||
def toggle_axis(self):
|
||||
if self.axis == 'default':
|
||||
self.set_axis('secondary')
|
||||
else:
|
||||
self.set_axis('default')
|
||||
63
rl_coach/dashboard_components/signals_file.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import os
|
||||
from os.path import basename
|
||||
|
||||
import pandas as pd
|
||||
from pandas.errors import EmptyDataError
|
||||
|
||||
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
|
||||
from rl_coach.dashboard_components.globals import x_axis_options
|
||||
from rl_coach.utils import break_file_path
|
||||
|
||||
|
||||
class SignalsFile(SignalsFileBase):
|
||||
def __init__(self, csv_path, load=True, plot=None, use_dir_name=False):
|
||||
super().__init__(plot)
|
||||
self.use_dir_name = use_dir_name
|
||||
self.full_csv_path = csv_path
|
||||
self.dir, self.filename, _ = break_file_path(csv_path)
|
||||
|
||||
if use_dir_name:
|
||||
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_path), '..'))
|
||||
if len(os.listdir(parent_directory_path)) == 1:
|
||||
# get the parent directory name (since the current directory is the timestamp directory)
|
||||
self.dir = parent_directory_path
|
||||
self.filename = basename(self.dir)
|
||||
else:
|
||||
# get the common directory for all the experiments
|
||||
self.dir = os.path.dirname(csv_path)
|
||||
self.filename = "{}/{}".format(basename(parent_directory_path), basename(self.dir))
|
||||
|
||||
if load:
|
||||
self.load()
|
||||
# this helps set the correct x axis
|
||||
self.change_averaging_window(1, force=True)
|
||||
|
||||
def load_csv(self, idx=None, result=None):
|
||||
# load csv and fix sparse data.
|
||||
# csv can be in the middle of being written so we use try - except
|
||||
new_csv = None
|
||||
while new_csv is None:
|
||||
try:
|
||||
new_csv = pd.read_csv(self.full_csv_path)
|
||||
break
|
||||
except EmptyDataError:
|
||||
new_csv = None
|
||||
continue
|
||||
|
||||
new_csv['Wall-Clock Time'] /= 60.
|
||||
new_csv = new_csv.interpolate()
|
||||
# remove signals which don't contain any values
|
||||
for k, v in new_csv.isna().all().items():
|
||||
if v and k not in x_axis_options:
|
||||
del new_csv[k]
|
||||
new_csv.fillna(value=0, inplace=True)
|
||||
|
||||
self.csv = new_csv
|
||||
|
||||
self.last_modified = os.path.getmtime(self.full_csv_path)
|
||||
|
||||
if idx is not None:
|
||||
result[idx] = (self.csv, self.last_modified)
|
||||
|
||||
def file_was_modified_on_disk(self):
|
||||
return self.last_modified != os.path.getmtime(self.full_csv_path)
|
||||
129
rl_coach/dashboard_components/signals_file_base.py
Normal file
@@ -0,0 +1,129 @@
|
||||
import numpy as np
|
||||
from bokeh.models import ColumnDataSource
|
||||
|
||||
from rl_coach.dashboard_components.signals import Signal
|
||||
from rl_coach.dashboard_components.globals import x_axis, x_axis_options, show_spinner
|
||||
|
||||
|
||||
class SignalsFileBase:
|
||||
def __init__(self, plot):
|
||||
self.plot = plot
|
||||
self.full_csv_path = ""
|
||||
self.dir = ""
|
||||
self.filename = ""
|
||||
self.signals_averaging_window = 1
|
||||
self.show_bollinger_bands = False
|
||||
self.csv = None
|
||||
self.bokeh_source = None
|
||||
self.bokeh_source_orig = None
|
||||
self.last_modified = None
|
||||
self.signals = {}
|
||||
self.separate_files = False
|
||||
self.last_reload_data_fix = False
|
||||
|
||||
def load_csv(self):
|
||||
pass
|
||||
|
||||
def update_x_axis_index(self):
|
||||
global x_axis
|
||||
self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis[0]]
|
||||
self.bokeh_source.data['index'] = self.bokeh_source.data[x_axis[0]]
|
||||
|
||||
def toggle_y_axis(self, signal_name=None):
|
||||
if signal_name and signal_name in self.signals.keys():
|
||||
self.signals[signal_name].toggle_axis()
|
||||
else:
|
||||
for signal in self.signals.values():
|
||||
if signal.selected:
|
||||
signal.toggle_axis()
|
||||
|
||||
def update_source_and_signals(self):
|
||||
# create bokeh data sources
|
||||
self.bokeh_source_orig = ColumnDataSource(self.csv)
|
||||
|
||||
if self.bokeh_source is None:
|
||||
self.bokeh_source = ColumnDataSource(self.csv)
|
||||
self.update_x_axis_index()
|
||||
else:
|
||||
self.update_x_axis_index()
|
||||
# smooth the data if necessary
|
||||
self.change_averaging_window(self.signals_averaging_window, force=True)
|
||||
|
||||
# create all the signals
|
||||
if len(self.signals.keys()) == 0:
|
||||
self.signals = {}
|
||||
unique_signal_names = []
|
||||
for name in self.csv.columns:
|
||||
if len(name.split('/')) == 1:
|
||||
unique_signal_names.append(name)
|
||||
else:
|
||||
unique_signal_names.append('/'.join(name.split('/')[:-1]))
|
||||
unique_signal_names = list(set(unique_signal_names))
|
||||
for signal_name in unique_signal_names:
|
||||
self.signals[signal_name] = Signal(signal_name, self, self.plot)
|
||||
|
||||
def load(self):
|
||||
self.load_csv()
|
||||
self.update_source_and_signals()
|
||||
|
||||
def reload_data(self):
|
||||
# this function is a workaround to reload the data of all the signals
|
||||
# if the data doesn't change, bokeh does not refresh the line
|
||||
temp_data = self.bokeh_source.data.copy()
|
||||
for col in self.bokeh_source.data.keys():
|
||||
if not self.last_reload_data_fix:
|
||||
temp_data[col] = temp_data[col][:-1]
|
||||
self.last_reload_data_fix = not self.last_reload_data_fix
|
||||
self.bokeh_source.data = temp_data
|
||||
|
||||
def change_averaging_window(self, new_size, force=False, signals=None):
|
||||
if force or self.signals_averaging_window != new_size:
|
||||
self.signals_averaging_window = new_size
|
||||
win = np.ones(new_size) / new_size
|
||||
temp_data = self.bokeh_source_orig.data.copy()
|
||||
for col in self.bokeh_source.data.keys():
|
||||
if col == 'index' or col in x_axis_options \
|
||||
or (signals and not any(col in signal for signal in signals)):
|
||||
temp_data[col] = temp_data[col][:-new_size]
|
||||
continue
|
||||
temp_data[col] = np.convolve(self.bokeh_source_orig.data[col], win, mode='same')[:-new_size]
|
||||
self.bokeh_source.data = temp_data
|
||||
|
||||
# smooth bollinger bands
|
||||
for signal in self.signals.values():
|
||||
if signal.has_bollinger_bands:
|
||||
signal.set_bands_source()
|
||||
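# Note (illustrative): np.ones(new_size) / new_size is a uniform moving-average kernel,
# so np.convolve(signal, win, mode='same') smooths each column while keeping its length;
# the trailing new_size samples are dropped afterwards to hide the edge artifacts of the
# convolution.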
|
||||
def hide_all_signals(self):
|
||||
for signal_name in self.signals.keys():
|
||||
self.set_signal_selection(signal_name, False)
|
||||
|
||||
def set_signal_selection(self, signal_name, val):
|
||||
self.signals[signal_name].set_selected(val)
|
||||
|
||||
def change_bollinger_bands_state(self, new_state):
|
||||
self.show_bollinger_bands = new_state
|
||||
for signal in self.signals.values():
|
||||
signal.change_bollinger_bands_state(new_state)
|
||||
|
||||
def file_was_modified_on_disk(self):
|
||||
pass
|
||||
|
||||
def get_range_of_selected_signals_on_axis(self, axis, selected_signal=None):
|
||||
max_val = -float('inf')
|
||||
min_val = float('inf')
|
||||
for signal in self.signals.values():
|
||||
if (selected_signal and signal.name == selected_signal) or (signal.selected and signal.axis == axis):
|
||||
max_val = max(max_val, signal.max_val)
|
||||
min_val = min(min_val, signal.min_val)
|
||||
return min_val, max_val
|
||||
|
||||
def get_selected_signals(self):
|
||||
signals = []
|
||||
for signal in self.signals.values():
|
||||
if signal.selected:
|
||||
signals.append(signal)
|
||||
return signals
|
||||
|
||||
def show_files_separately(self, val):
|
||||
pass
|
||||
192
rl_coach/dashboard_components/signals_files_group.py
Normal file
192
rl_coach/dashboard_components/signals_files_group.py
Normal file
@@ -0,0 +1,192 @@
|
||||
import os
|
||||
from multiprocessing import Process, Manager
|
||||
from os.path import basename
|
||||
|
||||
import pandas as pd
|
||||
from rl_coach.dashboard_components.globals import x_axis_options, add_directory_csv_files, show_spinner, x_axis
|
||||
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
|
||||
|
||||
from rl_coach.dashboard_components.signals_file import SignalsFile
|
||||
|
||||
|
||||
class SignalsFilesGroup(SignalsFileBase):
|
||||
def __init__(self, csv_paths, plot=None):
|
||||
super().__init__(plot)
|
||||
self.full_csv_paths = csv_paths
|
||||
self.signals_files = []
|
||||
if len(csv_paths) == 1 and os.path.isdir(csv_paths[0]):
|
||||
self.signals_files = [SignalsFile(str(file), load=False, plot=plot) for file in add_directory_csv_files(csv_paths[0])]
|
||||
else:
|
||||
for csv_path in csv_paths:
|
||||
if os.path.isdir(csv_path):
|
||||
self.signals_files.append(SignalsFilesGroup(add_directory_csv_files(csv_path), plot=plot))
|
||||
else:
|
||||
self.signals_files.append(SignalsFile(str(csv_path), load=False, plot=plot))
|
||||
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_paths[0]), '..'))
|
||||
|
||||
if len(os.listdir(parent_directory_path)) == 1:
|
||||
# get the parent directory name (since the current directory is the timestamp directory)
|
||||
self.dir = parent_directory_path
|
||||
else:
|
||||
# get the common directory for all the experiments
|
||||
self.dir = os.path.dirname('/'.join(os.path.commonprefix(csv_paths).split('/')[:-1]) + '/')
|
||||
|
||||
self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files))
|
||||
|
||||
self.signal_files_need_update = False
|
||||
|
||||
self.load()
|
||||
|
||||
def load_csv(self):
|
||||
global x_axis
|
||||
# load the csv's for all workers
|
||||
processes = []
|
||||
results = Manager().dict()
|
||||
corrupted_files_idx = []
|
||||
for idx, signal_file in enumerate(self.signals_files):
|
||||
if not isinstance(signal_file, SignalsFilesGroup):
|
||||
processes.append(Process(target=signal_file.load_csv, args=(idx, results)))
|
||||
processes[-1].start()
|
||||
[p.join() for p in processes]
|
||||
|
||||
# load csv's for SignalsFilesGroup serially for now. TODO: we should later parallelize this as well.
|
||||
for idx, signal_file in enumerate(self.signals_files):
|
||||
if isinstance(signal_file, SignalsFilesGroup):
|
||||
signal_file.load_csv()
|
||||
|
||||
for idx, signal_file in enumerate(self.signals_files):
|
||||
if len(list(results.keys())) > 0:
|
||||
signal_file.csv, signal_file.last_modified = results[idx]
|
||||
if not all(option in signal_file.csv.keys() for option in x_axis_options):
|
||||
print("Warning: {} file seems to be corrupted and does contain the necessary columns "
|
||||
"and will not be rendered".format(signal_file.filename))
|
||||
corrupted_files_idx.append(idx)
|
||||
|
||||
# remove corrupted worker files
|
||||
for file_idx in corrupted_files_idx:
|
||||
del self.signals_files[file_idx]
|
||||
|
||||
# get the stats of all the columns
|
||||
if len(self.signals_files) > 1:
|
||||
transformed_signals_files = []
|
||||
subsampling = None
|
||||
for idx in range(len(self.signals_files)):
|
||||
transformed_signals_files.append(self.signals_files[idx].csv.copy(deep=True))
|
||||
|
||||
# change the index to be the currently selected x axis
|
||||
transformed_signals_files[-1].index = transformed_signals_files[-1][x_axis[0]]
|
||||
|
||||
# remove all duplicate index rows
|
||||
transformed_signals_files[-1] = transformed_signals_files[-1][~transformed_signals_files[-1].index.duplicated()]
|
||||
|
||||
# fill up missing row indices. we are going to take the mean over the group and we want to make sure
|
||||
# the entire group has some value for every possible index.
|
||||
num_rows = int(transformed_signals_files[-1].index.values[-1])
|
||||
transformed_signals_files[-1] = transformed_signals_files[-1].reindex(range(num_rows))
|
||||
transformed_signals_files[-1].interpolate(inplace=True)
|
||||
|
||||
# sub sample the csv to max of 5000 indices (do the same subsampling to all files)
|
||||
if subsampling is None:
|
||||
subsampling = max(1, num_rows // 5000)
|
||||
transformed_signals_files[-1] = transformed_signals_files[-1].iloc[::subsampling, :]
|
||||
|
||||
csv_group = pd.concat([signals_file for signals_file in transformed_signals_files])
|
||||
columns_to_remove = [s for s in csv_group.columns if '/Stdev' in s] + \
|
||||
[s for s in csv_group.columns if '/Min' in s] + \
|
||||
[s for s in csv_group.columns if '/Max' in s]
|
||||
for col in columns_to_remove:
|
||||
del csv_group[col]
|
||||
csv_group = csv_group.groupby(csv_group.index)
|
||||
self.csv_mean = csv_group.mean()
|
||||
self.csv_mean.columns = [s + '/Mean' for s in self.csv_mean.columns]
|
||||
self.csv_stdev = csv_group.std()
|
||||
self.csv_stdev.columns = [s + '/Stdev' for s in self.csv_stdev.columns]
|
||||
self.csv_min = csv_group.min()
|
||||
self.csv_min.columns = [s + '/Min' for s in self.csv_min.columns]
|
||||
self.csv_max = csv_group.max()
|
||||
self.csv_max.columns = [s + '/Max' for s in self.csv_max.columns]
|
||||
|
||||
# get the indices from the file with the least number of indices and which is not an evaluation worker
|
||||
file_with_min_indices = transformed_signals_files[0]
|
||||
for signals_file in transformed_signals_files:
|
||||
if signals_file.shape[0] < file_with_min_indices.shape[0] and \
|
||||
'Training reward' in signals_file.keys():
|
||||
file_with_min_indices = signals_file
|
||||
self.index_columns = file_with_min_indices[x_axis_options]
|
||||
|
||||
# concat the stats and the indices columns
|
||||
num_rows = file_with_min_indices.shape[0]
|
||||
self.csv = pd.concat([self.index_columns, self.csv_mean.head(num_rows), self.csv_stdev.head(num_rows),
|
||||
self.csv_min.head(num_rows), self.csv_max.head(num_rows)], axis=1)
|
||||
|
||||
# remove the stat columns for the indices columns
|
||||
columns_to_remove = [s + '/Mean' for s in x_axis_options] + \
|
||||
[s + '/Stdev' for s in x_axis_options] + \
|
||||
[s + '/Min' for s in x_axis_options] + \
|
||||
[s + '/Max' for s in x_axis_options]
|
||||
for col in columns_to_remove:
|
||||
if col in self.csv.keys():
|
||||
del self.csv[col]
|
||||
else: # This is a group of a single file
|
||||
self.csv = self.signals_files[0].csv
|
||||
|
||||
# remove NaNs
|
||||
self.csv.fillna(value=0, inplace=True) # removing this line will make bollinger bands fail
|
||||
for key in self.csv.keys():
|
||||
if 'Stdev' in key and 'Evaluation' not in key:
|
||||
self.csv[key] = self.csv[key].fillna(value=0)
|
||||
|
||||
self.signal_files_need_update = True
|
||||
|
||||
def reload_data(self):
|
||||
SignalsFileBase.reload_data(self)
|
||||
|
||||
def update_x_axis_index(self):
|
||||
SignalsFileBase.update_x_axis_index(self)
|
||||
|
||||
# update the x axis for the bollinger bands
|
||||
for signal in self.signals.values():
|
||||
if signal.has_bollinger_bands:
|
||||
signal.set_bands_source()
|
||||
|
||||
def toggle_y_axis(self, signal_name=None):
|
||||
for signal in self.signals.values():
|
||||
if signal.selected:
|
||||
signal.toggle_axis()
|
||||
|
||||
def change_averaging_window(self, new_size, force=False, signals=None):
|
||||
SignalsFileBase.change_averaging_window(self, new_size, force, signals)
|
||||
|
||||
def set_signal_selection(self, signal_name, val):
|
||||
self.show_files_separately(self.separate_files)
|
||||
SignalsFileBase.set_signal_selection(self, signal_name, val)
|
||||
|
||||
def file_was_modified_on_disk(self):
|
||||
for signal_file in self.signals_files:
|
||||
if signal_file.file_was_modified_on_disk():
|
||||
return True
|
||||
return False
|
||||
|
||||
def show_files_separately(self, val):
|
||||
self.separate_files = val
|
||||
|
||||
# lazy updating of the signals of each of the workers
|
||||
if self.separate_files and self.signal_files_need_update:
|
||||
for signal_file in self.signals_files:
|
||||
signal_file.update_source_and_signals()
|
||||
self.signal_files_need_update = False
|
||||
|
||||
for signal in self.signals.values():
|
||||
if signal.selected:
|
||||
if val:
|
||||
signal.set_dash("4 4")
|
||||
else:
|
||||
signal.set_dash("")
|
||||
for signal_file in self.signals_files:
|
||||
try:
|
||||
if val:
|
||||
signal_file.set_signal_selection(signal.name, signal.selected)
|
||||
else:
|
||||
signal_file.set_signal_selection(signal.name, False)
|
||||
except:
|
||||
pass
|
||||
219
rl_coach/dashboard_components/spinner.css
Normal file
219
rl_coach/dashboard_components/spinner.css
Normal file
@@ -0,0 +1,219 @@
|
||||
/* based on https://codepen.io/widmr/pen/tklqx by Andreas Widmer */
|
||||
|
||||
.spinner {
|
||||
font-size: 80px;
|
||||
width: 1em;
|
||||
height: 1em;
|
||||
position: fixed;
|
||||
left: 40%;
|
||||
top: 20%;
|
||||
z-index: 9999;
|
||||
margin: 100px auto;
|
||||
border-radius: 50%;
|
||||
list-style: none;
|
||||
}
|
||||
|
||||
.spinner li {
|
||||
position: absolute;
|
||||
width: .2em;
|
||||
height: .2em;
|
||||
border-radius: 50%;
|
||||
}
|
||||
|
||||
.spinner li:nth-child(1) {
|
||||
left: 50%;
|
||||
top: 0;
|
||||
margin: 0 0 0 -.1em;
|
||||
background: #00C176;
|
||||
-webkit-transform-origin: 50% 250%;
|
||||
-moz-transform-origin: 50% 250%;
|
||||
-ms-transform-origin: 50% 250%;
|
||||
-o-transform-origin: 50% 250%;
|
||||
transform-origin: 50% 250%;
|
||||
-webkit-animation:
|
||||
rota 1.13s linear infinite,
|
||||
opa 3.67s ease-in-out infinite alternate;
|
||||
-moz-animation:
|
||||
rota 1.13s linear infinite,
|
||||
opa 3.67s ease-in-out infinite alternate;
|
||||
-ms-animation:
|
||||
rota 1.13s linear infinite,
|
||||
opa 3.67s ease-in-out infinite alternate;
|
||||
-o-animation:
|
||||
rota 1.13s linear infinite,
|
||||
opa 3.67s ease-in-out infinite alternate;
|
||||
animation:
|
||||
rota 1.13s linear infinite,
|
||||
opa 3.67s ease-in-out infinite alternate;
|
||||
}
|
||||
|
||||
.spinner li:nth-child(2) {
|
||||
top: 50%;
|
||||
right: 0;
|
||||
margin: -.1em 0 0 0;
|
||||
background: #FF003C;
|
||||
-webkit-transform-origin: -150% 50%;
|
||||
-moz-transform-origin: -150% 50%;
|
||||
-ms-transform-origin: -150% 50%;
|
||||
-o-transform-origin: -150% 50%;
|
||||
transform-origin: -150% 50%;
|
||||
-webkit-animation:
|
||||
rota 1.86s linear infinite,
|
||||
opa 4.29s ease-in-out infinite alternate;
|
||||
-moz-animation:
|
||||
rota 1.86s linear infinite,
|
||||
opa 4.29s ease-in-out infinite alternate;
|
||||
-ms-animation:
|
||||
rota 1.86s linear infinite,
|
||||
opa 4.29s ease-in-out infinite alternate;
|
||||
-o-animation:
|
||||
rota 1.86s linear infinite,
|
||||
opa 4.29s ease-in-out infinite alternate;
|
||||
animation:
|
||||
rota 1.86s linear infinite,
|
||||
opa 4.29s ease-in-out infinite alternate;
|
||||
}
|
||||
|
||||
.spinner li:nth-child(3) {
|
||||
left: 50%;
|
||||
bottom: 0;
|
||||
margin: 0 0 0 -.1em;
|
||||
background: #FABE28;
|
||||
-webkit-transform-origin: 50% -150%;
|
||||
-moz-transform-origin: 50% -150%;
|
||||
-ms-transform-origin: 50% -150%;
|
||||
-o-transform-origin: 50% -150%;
|
||||
transform-origin: 50% -150%;
|
||||
-webkit-animation:
|
||||
rota 1.45s linear infinite,
|
||||
opa 5.12s ease-in-out infinite alternate;
|
||||
-moz-animation:
|
||||
rota 1.45s linear infinite,
|
||||
opa 5.12s ease-in-out infinite alternate;
|
||||
-ms-animation:
|
||||
rota 1.45s linear infinite,
|
||||
opa 5.12s ease-in-out infinite alternate;
|
||||
-o-animation:
|
||||
rota 1.45s linear infinite,
|
||||
opa 5.12s ease-in-out infinite alternate;
|
||||
animation:
|
||||
rota 1.45s linear infinite,
|
||||
opa 5.12s ease-in-out infinite alternate;
|
||||
}
|
||||
|
||||
.spinner li:nth-child(4) {
|
||||
top: 50%;
|
||||
left: 0;
|
||||
margin: -.1em 0 0 0;
|
||||
background: #88C100;
|
||||
-webkit-transform-origin: 250% 50%;
|
||||
-moz-transform-origin: 250% 50%;
|
||||
-ms-transform-origin: 250% 50%;
|
||||
-o-transform-origin: 250% 50%;
|
||||
transform-origin: 250% 50%;
|
||||
-webkit-animation:
|
||||
rota 1.72s linear infinite,
|
||||
opa 5.25s ease-in-out infinite alternate;
|
||||
-moz-animation:
|
||||
rota 1.72s linear infinite,
|
||||
opa 5.25s ease-in-out infinite alternate;
|
||||
-ms-animation:
|
||||
rota 1.72s linear infinite,
|
||||
opa 5.25s ease-in-out infinite alternate;
|
||||
-o-animation:
|
||||
rota 1.72s linear infinite,
|
||||
opa 5.25s ease-in-out infinite alternate;
|
||||
animation:
|
||||
rota 1.72s linear infinite,
|
||||
opa 5.25s ease-in-out infinite alternate;
|
||||
}
|
||||
|
||||
@-webkit-keyframes rota {
|
||||
to { -webkit-transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
@-moz-keyframes rota {
|
||||
to { -moz-transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
@-ms-keyframes rota {
|
||||
to { -ms-transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
@-o-keyframes rota {
|
||||
to { -o-transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
@keyframes rota {
|
||||
to { transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
@-webkit-keyframes opa {
|
||||
12.0% { opacity: 0.80; }
|
||||
19.5% { opacity: 0.88; }
|
||||
37.2% { opacity: 0.64; }
|
||||
40.5% { opacity: 0.52; }
|
||||
52.7% { opacity: 0.69; }
|
||||
60.2% { opacity: 0.60; }
|
||||
66.6% { opacity: 0.52; }
|
||||
70.0% { opacity: 0.63; }
|
||||
79.9% { opacity: 0.60; }
|
||||
84.2% { opacity: 0.75; }
|
||||
91.0% { opacity: 0.87; }
|
||||
}
|
||||
|
||||
@-moz-keyframes opa {
|
||||
12.0% { opacity: 0.80; }
|
||||
19.5% { opacity: 0.88; }
|
||||
37.2% { opacity: 0.64; }
|
||||
40.5% { opacity: 0.52; }
|
||||
52.7% { opacity: 0.69; }
|
||||
60.2% { opacity: 0.60; }
|
||||
66.6% { opacity: 0.52; }
|
||||
70.0% { opacity: 0.63; }
|
||||
79.9% { opacity: 0.60; }
|
||||
84.2% { opacity: 0.75; }
|
||||
91.0% { opacity: 0.87; }
|
||||
}
|
||||
|
||||
@-ms-keyframes opa {
|
||||
12.0% { opacity: 0.80; }
|
||||
19.5% { opacity: 0.88; }
|
||||
37.2% { opacity: 0.64; }
|
||||
40.5% { opacity: 0.52; }
|
||||
52.7% { opacity: 0.69; }
|
||||
60.2% { opacity: 0.60; }
|
||||
66.6% { opacity: 0.52; }
|
||||
70.0% { opacity: 0.63; }
|
||||
79.9% { opacity: 0.60; }
|
||||
84.2% { opacity: 0.75; }
|
||||
91.0% { opacity: 0.87; }
|
||||
}
|
||||
|
||||
@-o-keyframes opa {
|
||||
12.0% { opacity: 0.80; }
|
||||
19.5% { opacity: 0.88; }
|
||||
37.2% { opacity: 0.64; }
|
||||
40.5% { opacity: 0.52; }
|
||||
52.7% { opacity: 0.69; }
|
||||
60.2% { opacity: 0.60; }
|
||||
66.6% { opacity: 0.52; }
|
||||
70.0% { opacity: 0.63; }
|
||||
79.9% { opacity: 0.60; }
|
||||
84.2% { opacity: 0.75; }
|
||||
91.0% { opacity: 0.87; }
|
||||
}
|
||||
|
||||
@keyframes opa {
|
||||
12.0% { opacity: 0.80; }
|
||||
19.5% { opacity: 0.88; }
|
||||
37.2% { opacity: 0.64; }
|
||||
40.5% { opacity: 0.52; }
|
||||
52.7% { opacity: 0.69; }
|
||||
60.2% { opacity: 0.60; }
|
||||
66.6% { opacity: 0.52; }
|
||||
70.0% { opacity: 0.63; }
|
||||
79.9% { opacity: 0.60; }
|
||||
84.2% { opacity: 0.75; }
|
||||
91.0% { opacity: 0.87; }
|
||||
}
|
||||
0
rl_coach/datasets/README.md
Normal file
0
rl_coach/datasets/README.md
Normal file
BIN
rl_coach/datasets/doom_basic.tar.gz
Normal file
BIN
rl_coach/datasets/doom_basic.tar.gz
Normal file
Binary file not shown.
BIN
rl_coach/datasets/montezuma_revenge.tar.gz
Normal file
BIN
rl_coach/datasets/montezuma_revenge.tar.gz
Normal file
Binary file not shown.
77
rl_coach/debug_utils.py
Normal file
77
rl_coach/debug_utils.py
Normal file
@@ -0,0 +1,77 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import math
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from rl_coach.filters.observation.observation_stacking_filter import LazyStack
|
||||
|
||||
|
||||
def show_observation_stack(stack, channels_last=True, show=True, force_num_rows=None, row_to_update=0):
|
||||
if isinstance(stack, LazyStack):
|
||||
stack = np.array(stack)
|
||||
if isinstance(stack, list): # is list
|
||||
stack_size = len(stack)
|
||||
elif len(stack.shape) == 3:
|
||||
stack_size = stack.shape[0] # is numpy array
|
||||
elif len(stack.shape) == 4:
|
||||
stack_size = stack.shape[1] # ignore batch dimension
|
||||
stack = stack[0]
|
||||
else:
|
||||
raise ValueError("The observation stack must be a list, a numpy array or a LazyStack object")
|
||||
|
||||
if channels_last:
|
||||
stack = np.transpose(stack, (2, 0, 1))
|
||||
stack_size = stack.shape[0]
|
||||
|
||||
max_cols = 10
|
||||
if force_num_rows:
|
||||
rows = force_num_rows
|
||||
else:
|
||||
rows = math.ceil(stack_size / max_cols)
|
||||
cols = max_cols if stack_size > max_cols else stack_size
|
||||
|
||||
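# lay the frames out on a grid of at most max_cols columns; row_to_update selects which grid row to draw into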
for i in range(stack_size):
|
||||
plt.subplot(rows, cols, row_to_update * cols + i + 1)
|
||||
plt.imshow(stack[i], cmap='gray')
|
||||
|
||||
if show:
|
||||
plt.show()
|
||||
|
||||
|
||||
def show_diff_between_two_observations(observation1, observation2):
|
||||
plt.imshow(observation1 - observation2, cmap='gray')
|
||||
plt.show()
|
||||
|
||||
|
||||
def plot_grayscale_observation(observation):
|
||||
plt.imshow(observation, cmap='gray')
|
||||
plt.show()
|
||||
|
||||
|
||||
def plot_episode_states(episode_transitions, state_variable: str='state', observation_index_in_stack: int=0):
|
||||
observations = []
|
||||
for transition in episode_transitions:
|
||||
observations.append(np.array(getattr(transition, state_variable)['observation'])[..., observation_index_in_stack])
|
||||
show_observation_stack(observations, False)
|
||||
|
||||
|
||||
def plot_list_of_observation_stacks(observation_stacks):
|
||||
for idx, stack in enumerate(observation_stacks):
|
||||
show_observation_stack(stack['observation'], True, False,
|
||||
force_num_rows=len(observation_stacks), row_to_update=idx)
|
||||
plt.show()
|
||||
112
rl_coach/environments/CarlaSettings.ini
Normal file
112
rl_coach/environments/CarlaSettings.ini
Normal file
@@ -0,0 +1,112 @@
|
||||
; Example of settings file for CARLA.
|
||||
;
|
||||
; This file can be loaded with the Python client to be sent to the server. It
|
||||
; defines the parameters to be used when requesting a new episode.
|
||||
;
|
||||
; Note that server specific variables are only loaded when launching the
|
||||
; simulator. Use it with `./CarlaUE4.sh -carla-settings=Path/To/This/File`.
|
||||
|
||||
[CARLA/Server]
|
||||
; If set to false, a mock controller will be used instead of waiting for a real
|
||||
; client to connect. (Server only)
|
||||
UseNetworking=false
|
||||
; Ports to use for the server-client communication. This can be overridden by
|
||||
; the command-line switch `-world-port=N`, write and read ports will be set to
|
||||
; N+1 and N+2 respectively. (Server only)
|
||||
WorldPort=2000
|
||||
; Time-out in milliseconds for the networking operations. (Server only)
|
||||
ServerTimeOut=100000000000
|
||||
; In synchronous mode, CARLA waits every frame until the control from the client
|
||||
; is received.
|
||||
SynchronousMode=true
|
||||
; Send info about every non-player agent in the scene every frame, the
|
||||
; information is attached to the measurements message. This includes other
|
||||
; vehicles, pedestrians and traffic signs. Disabled by default to improve
|
||||
; performance.
|
||||
SendNonPlayerAgentsInfo=false
|
||||
|
||||
[CARLA/QualitySettings]
|
||||
; Quality level of the graphics, a lower level makes the simulation run
|
||||
; considerably faster. Available: Low or Epic.
|
||||
QualityLevel=Low
|
||||
|
||||
[CARLA/LevelSettings]
|
||||
; Path of the vehicle class to be used for the player. Leave empty for default.
|
||||
; Paths follow the pattern "/Game/Blueprints/Vehicles/Mustang/Mustang.Mustang_C"
|
||||
PlayerVehicle=
|
||||
; Number of non-player vehicles to be spawned into the level.
|
||||
NumberOfVehicles=15
|
||||
; Number of non-player pedestrians to be spawned into the level.
|
||||
NumberOfPedestrians=30
|
||||
; Index of the weather/lighting presets to use. If negative, the default presets
|
||||
; of the map will be used.
|
||||
WeatherId=1
|
||||
; Seeds for the pseudo-random number generators.
|
||||
SeedVehicles=123456789
|
||||
SeedPedestrians=123456789
|
||||
|
||||
[CARLA/Sensor]
|
||||
; Names of the sensors to be attached to the player, comma-separated, each of
|
||||
; them should be defined in its own subsection.
|
||||
|
||||
; The next line adds a camera called FrontCamera to the vehicle
|
||||
Sensors=FrontCamera
|
||||
|
||||
; or uncomment next line to add a camera and a Lidar
|
||||
; Sensors=FrontCamera,MyLidar
|
||||
|
||||
; or uncomment next line to add a regular camera and a depth camera
|
||||
; Sensors=FrontCamera,FrontCamera/Depth
|
||||
|
||||
; Now, every camera we added needs to be defined in its own subsection.
|
||||
[CARLA/Sensor/FrontCamera]
|
||||
; Type of the sensor. The available types are:
|
||||
; * CAMERA A scene capture camera.
|
||||
; * LIDAR_RAY_CAST A Lidar implementation based on ray-casting.
|
||||
SensorType=CAMERA
|
||||
; Post-processing effect to be applied to this camera. Valid values:
|
||||
; * None No effects applied.
|
||||
; * SceneFinal Post-processing present at scene (bloom, fog, etc).
|
||||
; * Depth Depth map ground-truth only.
|
||||
; * SemanticSegmentation Semantic segmentation ground-truth only.
|
||||
PostProcessing=SceneFinal
|
||||
; Size of the captured image in pixels.
|
||||
ImageSizeX=360
|
||||
ImageSizeY=256
|
||||
; Camera (horizontal) field of view in degrees.
|
||||
FOV=90
|
||||
; Position of the camera relative to the car in meters.
|
||||
PositionX=0.20
|
||||
PositionY=0
|
||||
PositionZ=1.30
|
||||
; Rotation of the camera relative to the car in degrees.
|
||||
RotationPitch=8
|
||||
RotationRoll=0
|
||||
RotationYaw=0
|
||||
|
||||
[CARLA/Sensor/FrontCamera/Depth]
|
||||
; The sensor can be defined in a subsection of FrontCamera so it inherits the
|
||||
; values in FrontCamera. This adds a camera similar to FrontCamera but generating
|
||||
; depth map images instead.
|
||||
PostProcessing=Depth
|
||||
|
||||
[CARLA/Sensor/MyLidar]
|
||||
SensorType=LIDAR_RAY_CAST
|
||||
; Number of lasers.
|
||||
Channels=32
|
||||
; Measure distance in meters.
|
||||
Range=50.0
|
||||
; Points generated by all lasers per second.
|
||||
PointsPerSecond=100000
|
||||
; Lidar rotation frequency.
|
||||
RotationFrequency=10
|
||||
; Upper and lower laser angles; positive values mean above the horizontal line.
|
||||
UpperFOVLimit=10
|
||||
LowerFOVLimit=-30
|
||||
; Position and rotation relative to the vehicle.
|
||||
PositionX=0
|
||||
PositionY=0
|
||||
PositionZ=1.40
|
||||
RotationPitch=0
|
||||
RotationYaw=0
|
||||
RotationRoll=0
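For reference, a rough client-side sketch (assuming the CARLA PythonClient package is importable, as in carla_environment.py further below) of how a settings file like the one above is read and sent to the server when requesting an episode:

```python
from carla.client import CarlaClient

# read the raw ini text; it can be passed to load_settings as-is
with open('CarlaSettings.ini', 'r') as fp:
    settings = fp.read()

client = CarlaClient('localhost', 2000, timeout=100)  # WorldPort from the [CARLA/Server] section
client.connect()
scene = client.load_settings(settings)  # the reply describes the scene, e.g. scene.player_start_spots
client.start_episode(0)                 # start from one of the available start positions
```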
|
||||
19
rl_coach/environments/README.md
Normal file
19
rl_coach/environments/README.md
Normal file
@@ -0,0 +1,19 @@
|
||||
A custom environment input filter implementation should look like this:
|
||||
|
||||
```python
|
||||
from rl_coach.filters.filter import InputFilter
|
||||
|
||||
class CustomFilter(InputFilter):
|
||||
def __init__(self):
|
||||
...
|
||||
def _filter(self, env_response: EnvResponse) -> EnvResponse:
|
||||
...
|
||||
def _get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace:
|
||||
...
|
||||
def _get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
|
||||
...
|
||||
def _validate_input_observation_space(self, input_observation_space: ObservationSpace):
|
||||
...
|
||||
def _reset(self):
|
||||
...
|
||||
```
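
A filter like this is typically plugged into an environment through its parameters class. The sketch below is illustrative only (the class and module names are hypothetical); it mirrors how `CarlaEnvironmentParameters` and `DoomEnvironmentParameters` set `default_input_filter` in this release:

```python
from rl_coach.environments.environment import EnvironmentParameters


class MyEnvironmentParameters(EnvironmentParameters):  # hypothetical example
    def __init__(self):
        super().__init__()
        # use the custom filter defined above as the environment's default input filter
        self.default_input_filter = CustomFilter()

    @property
    def path(self):
        # 'module:class' path of the matching Environment implementation (hypothetical)
        return 'my_package.my_environment:MyEnvironment'
```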
|
||||
16
rl_coach/environments/__init__.py
Normal file
16
rl_coach/environments/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
357
rl_coach/environments/carla_environment.py
Normal file
357
rl_coach/environments/carla_environment.py
Normal file
@@ -0,0 +1,357 @@
|
||||
import random
|
||||
import sys
|
||||
from os import path, environ
|
||||
|
||||
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
|
||||
|
||||
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
|
||||
|
||||
try:
|
||||
if 'CARLA_ROOT' in environ:
|
||||
sys.path.append(path.join(environ.get('CARLA_ROOT'), 'PythonClient'))
|
||||
from carla.client import CarlaClient
|
||||
from carla.settings import CarlaSettings
|
||||
from carla.tcp import TCPConnectionError
|
||||
from carla.sensor import Camera
|
||||
from carla.client import VehicleControl
|
||||
except ImportError:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("CARLA")
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
|
||||
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, StateSpace, \
|
||||
VectorObservationSpace
|
||||
from rl_coach.utils import get_open_port, force_list
|
||||
from enum import Enum
|
||||
import os
|
||||
import signal
|
||||
from typing import List, Union
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.filters.filter import InputFilter, NoOutputFilter
|
||||
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
|
||||
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
|
||||
import numpy as np
|
||||
|
||||
|
||||
# enum of the available levels and their path
|
||||
class CarlaLevel(Enum):
|
||||
TOWN1 = "/Game/Maps/Town01"
|
||||
TOWN2 = "/Game/Maps/Town02"
|
||||
|
||||
key_map = {
|
||||
'BRAKE': (274,), # down arrow
|
||||
'GAS': (273,), # up arrow
|
||||
'TURN_LEFT': (276,), # left arrow
|
||||
'TURN_RIGHT': (275,), # right arrow
|
||||
'GAS_AND_TURN_LEFT': (273, 276),
|
||||
'GAS_AND_TURN_RIGHT': (273, 275),
|
||||
'BRAKE_AND_TURN_LEFT': (274, 276),
|
||||
'BRAKE_AND_TURN_RIGHT': (274, 275),
|
||||
}
|
||||
|
||||
CarlaInputFilter = InputFilter(is_a_reference_filter=True)
|
||||
CarlaInputFilter.add_observation_filter('forward_camera', 'rescaling',
|
||||
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([128, 180, 3]),
|
||||
high=255)))
|
||||
CarlaInputFilter.add_observation_filter('forward_camera', 'to_grayscale', ObservationRGBToYFilter())
|
||||
CarlaInputFilter.add_observation_filter('forward_camera', 'to_uint8', ObservationToUInt8Filter(0, 255))
|
||||
CarlaInputFilter.add_observation_filter('forward_camera', 'stacking', ObservationStackingFilter(4))
|
||||
|
||||
CarlaOutputFilter = NoOutputFilter()
|
||||
|
||||
|
||||
class CameraTypes(Enum):
|
||||
FRONT = "forward_camera"
|
||||
LEFT = "left_camera"
|
||||
RIGHT = "right_camera"
|
||||
SEGMENTATION = "segmentation"
|
||||
DEPTH = "depth"
|
||||
LIDAR = "lidar"
|
||||
|
||||
|
||||
class CarlaEnvironmentParameters(EnvironmentParameters):
|
||||
class Quality(Enum):
|
||||
LOW = "Low"
|
||||
EPIC = "Epic"
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame_skip = 3 # the frame skip affects the fps of the server directly. fps = 30 / frameskip
|
||||
self.server_height = 512
|
||||
self.server_width = 720
|
||||
self.camera_height = 128
|
||||
self.camera_width = 180
|
||||
self.config = None #'environments/CarlaSettings.ini' # TODO: remove the config to prevent confusion
|
||||
self.level = 'town1'
|
||||
self.quality = self.Quality.LOW
|
||||
self.cameras = [CameraTypes.FRONT]
|
||||
self.weather_id = [1]
|
||||
self.verbose = True
|
||||
self.episode_max_time = 100000 # milliseconds for each episode
|
||||
self.allow_braking = False
|
||||
self.default_input_filter = CarlaInputFilter
|
||||
self.default_output_filter = CarlaOutputFilter
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.environments.carla_environment:CarlaEnvironment'
|
||||
|
||||
|
||||
class CarlaEnvironment(Environment):
|
||||
def __init__(self, level: LevelSelection,
|
||||
seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float],
|
||||
visualization_parameters: VisualizationParameters,
|
||||
server_height: int, server_width: int, camera_height: int, camera_width: int,
|
||||
verbose: bool, config: str, episode_max_time: int,
|
||||
allow_braking: bool, quality: CarlaEnvironmentParameters.Quality,
|
||||
cameras: List[CameraTypes], weather_id: List[int], **kwargs):
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
|
||||
|
||||
# server configuration
|
||||
self.server_height = server_height
|
||||
self.server_width = server_width
|
||||
self.port = get_open_port()
|
||||
self.host = 'localhost'
|
||||
self.map = self.env_id
|
||||
|
||||
# client configuration
|
||||
self.verbose = verbose
|
||||
self.quality = quality
|
||||
self.cameras = cameras
|
||||
self.weather_id = weather_id
|
||||
self.episode_max_time = episode_max_time
|
||||
self.allow_braking = allow_braking
|
||||
self.camera_width = camera_width
|
||||
self.camera_height = camera_height
|
||||
|
||||
# state space
|
||||
self.state_space = StateSpace({
|
||||
"measurements": VectorObservationSpace(4, measurements_names=["forward_speed", "x", "y", "z"])
|
||||
})
|
||||
for camera in self.cameras:
|
||||
self.state_space[camera.value] = ImageObservationSpace(
|
||||
shape=np.array([self.camera_height, self.camera_width, 3]),
|
||||
high=255)
|
||||
|
||||
# setup server settings
|
||||
self.config = config
|
||||
if self.config:
|
||||
# load settings from file
|
||||
with open(self.config, 'r') as fp:
|
||||
self.settings = fp.read()
|
||||
else:
|
||||
# hard coded settings
|
||||
self.settings = CarlaSettings()
|
||||
self.settings.set(
|
||||
SynchronousMode=True,
|
||||
SendNonPlayerAgentsInfo=False,
|
||||
NumberOfVehicles=15,
|
||||
NumberOfPedestrians=30,
|
||||
WeatherId=random.choice(force_list(self.weather_id)),
|
||||
QualityLevel=self.quality.value)
|
||||
self.settings.randomize_seeds()
|
||||
|
||||
self.settings = self._add_cameras(self.settings, self.cameras, self.camera_width, self.camera_height)
|
||||
|
||||
# open the server
|
||||
self.server = self._open_server()
|
||||
|
||||
logging.disable(40)
|
||||
|
||||
# open the client
|
||||
self.game = CarlaClient(self.host, self.port, timeout=99999999)
|
||||
self.game.connect()
|
||||
scene = self.game.load_settings(self.settings)
|
||||
|
||||
# get available start positions
|
||||
positions = scene.player_start_spots
|
||||
self.num_pos = len(positions)
|
||||
self.iterator_start_positions = 0
|
||||
|
||||
# action space
|
||||
self.action_space = BoxActionSpace(shape=2, low=np.array([-1, -1]), high=np.array([1, 1]))
|
||||
|
||||
# human control
|
||||
if self.human_control:
|
||||
# convert continuous action space to discrete
|
||||
self.steering_strength = 0.5
|
||||
self.gas_strength = 1.0
|
||||
self.brake_strength = 0.5
|
||||
self.action_space = PartialDiscreteActionSpaceMap(
|
||||
target_actions=[[0., 0.],
|
||||
[0., -self.steering_strength],
|
||||
[0., self.steering_strength],
|
||||
[self.gas_strength, 0.],
|
||||
[-self.brake_strength, 0],
|
||||
[self.gas_strength, -self.steering_strength],
|
||||
[self.gas_strength, self.steering_strength],
|
||||
[self.brake_strength, -self.steering_strength],
|
||||
[self.brake_strength, self.steering_strength]],
|
||||
target_action_space=self.action_space,
|
||||
descriptions=['NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
|
||||
'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
|
||||
'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT']
|
||||
)
|
||||
|
||||
# map keyboard keys to actions
|
||||
for idx, action in enumerate(self.action_space.descriptions):
|
||||
for key in key_map.keys():
|
||||
if action == key:
|
||||
self.key_to_action[key_map[key]] = idx
|
||||
|
||||
self.num_speedup_steps = 30
|
||||
|
||||
# measurements
|
||||
self.autopilot = None
|
||||
|
||||
# env initialization
|
||||
self.reset_internal_state(True)
|
||||
|
||||
# render
|
||||
if self.is_rendered:
|
||||
image = self.get_rendered_image()
|
||||
self.renderer.create_screen(image.shape[1], image.shape[0])
|
||||
|
||||
def _add_cameras(self, settings, cameras, camera_width, camera_height):
|
||||
# add a front facing camera
|
||||
if CameraTypes.FRONT in cameras:
|
||||
camera = Camera(CameraTypes.FRONT.value)
|
||||
camera.set_image_size(camera_width, camera_height)
|
||||
camera.set_position(0.2, 0, 1.3)
|
||||
camera.set_rotation(8, 0, 0)
|
||||
settings.add_sensor(camera)
|
||||
|
||||
# add a left facing camera
|
||||
if CameraTypes.LEFT in cameras:
|
||||
camera = Camera(CameraTypes.LEFT.value)
|
||||
camera.set_image_size(camera_width, camera_height)
|
||||
camera.set_position(0.2, 0, 1.3)
|
||||
camera.set_rotation(8, -30, 0)
|
||||
settings.add_sensor(camera)
|
||||
|
||||
# add a right facing camera
|
||||
if CameraTypes.RIGHT in cameras:
|
||||
camera = Camera(CameraTypes.RIGHT.value)
|
||||
camera.set_image_size(camera_width, camera_height)
|
||||
camera.set_position(0.2, 0, 1.3)
|
||||
camera.set_rotation(8, 30, 0)
|
||||
settings.add_sensor(camera)
|
||||
|
||||
# add a front facing depth camera
|
||||
if CameraTypes.DEPTH in cameras:
|
||||
camera = Camera(CameraTypes.DEPTH.value)
|
||||
camera.set_image_size(camera_width, camera_height)
|
||||
camera.set_position(0.2, 0, 1.3)
|
||||
camera.set_rotation(8, 30, 0)
|
||||
camera.PostProcessing = 'Depth'
|
||||
settings.add_sensor(camera)
|
||||
|
||||
# add a front facing semantic segmentation camera
|
||||
if CameraTypes.SEGMENTATION in cameras:
|
||||
camera = Camera(CameraTypes.SEGMENTATION.value)
|
||||
camera.set_image_size(camera_width, camera_height)
|
||||
camera.set_position(0.2, 0, 1.3)
|
||||
camera.set_rotation(8, 30, 0)
|
||||
camera.PostProcessing = 'SemanticSegmentation'
|
||||
settings.add_sensor(camera)
|
||||
|
||||
return settings
|
||||
|
||||
def _open_server(self):
|
||||
# TODO: get experiment path
|
||||
log_path = path.join('./logs/', "CARLA_LOG_{}.txt".format(self.port))
|
||||
with open(log_path, "wb") as out:
|
||||
cmd = [path.join(environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map,
|
||||
"-benchmark", "-carla-server", "-fps={}".format(30 / self.frame_skip),
|
||||
"-world-port={}".format(self.port),
|
||||
"-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height),
|
||||
"-carla-no-hud"]
|
||||
|
||||
if self.config:
|
||||
cmd.append("-carla-settings={}".format(self.config))
|
||||
p = subprocess.Popen(cmd, stdout=out, stderr=out)
|
||||
|
||||
return p
|
||||
|
||||
def _close_server(self):
|
||||
os.killpg(os.getpgid(self.server.pid), signal.SIGKILL)
|
||||
|
||||
def _update_state(self):
|
||||
# get measurements and observations
|
||||
measurements = []
|
||||
while type(measurements) == list:
|
||||
measurements, sensor_data = self.game.read_data()
|
||||
self.state = {}
|
||||
|
||||
for camera in self.cameras:
|
||||
self.state[camera.value] = sensor_data[camera.value].data
|
||||
|
||||
self.location = [measurements.player_measurements.transform.location.x,
|
||||
measurements.player_measurements.transform.location.y,
|
||||
measurements.player_measurements.transform.location.z]
|
||||
|
||||
is_collision = measurements.player_measurements.collision_vehicles != 0 \
|
||||
or measurements.player_measurements.collision_pedestrians != 0 \
|
||||
or measurements.player_measurements.collision_other != 0
|
||||
|
||||
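# reward shaping: forward speed (capped at 30) minus penalties for crossing into the other lane,
# going off-road, collisions and large steering values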
speed_reward = measurements.player_measurements.forward_speed - 1
|
||||
if speed_reward > 30.:
|
||||
speed_reward = 30.
|
||||
self.reward = speed_reward \
|
||||
- (measurements.player_measurements.intersection_otherlane * 5) \
|
||||
- (measurements.player_measurements.intersection_offroad * 5) \
|
||||
- is_collision * 100 \
|
||||
- np.abs(self.control.steer) * 10
|
||||
|
||||
# update measurements
|
||||
self.measurements = [measurements.player_measurements.forward_speed] + self.location
|
||||
self.autopilot = measurements.player_measurements.autopilot_control
|
||||
|
||||
# action_p = ['%.2f' % member for member in [self.control.throttle, self.control.steer]]
|
||||
# screen.success('REWARD: %.2f, ACTIONS: %s' % (self.reward, action_p))
|
||||
|
||||
if (measurements.game_timestamp >= self.episode_max_time) or is_collision:
|
||||
# screen.success('EPISODE IS DONE. GameTime: {}, Collision: {}'.format(str(measurements.game_timestamp),
|
||||
# str(is_collision)))
|
||||
self.done = True
|
||||
|
||||
self.state['measurements'] = self.measurements
|
||||
|
||||
def _take_action(self, action):
|
||||
self.control = VehicleControl()
|
||||
self.control.throttle = np.clip(action[0], 0, 1)
|
||||
self.control.steer = np.clip(action[1], -1, 1)
|
||||
self.control.brake = np.abs(np.clip(action[0], -1, 0))
|
||||
if not self.allow_braking:
|
||||
self.control.brake = 0
|
||||
self.control.hand_brake = False
|
||||
self.control.reverse = False
|
||||
|
||||
self.game.send_control(self.control)
|
||||
|
||||
def _restart_environment_episode(self, force_environment_reset=False):
|
||||
self.iterator_start_positions += 1
|
||||
if self.iterator_start_positions >= self.num_pos:
|
||||
self.iterator_start_positions = 0
|
||||
|
||||
try:
|
||||
self.game.start_episode(self.iterator_start_positions)
|
||||
except:
|
||||
self.game.connect()
|
||||
self.game.start_episode(self.iterator_start_positions)
|
||||
|
||||
# start the game with some initial speed
|
||||
for i in range(self.num_speedup_steps):
|
||||
self._take_action([1.0, 0])
|
||||
|
||||
def get_rendered_image(self) -> np.ndarray:
|
||||
"""
|
||||
Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco's observation is a measurements vector.
|
||||
:return: numpy array containing the image that will be rendered to the screen
|
||||
"""
|
||||
image = [self.state[camera.value] for camera in self.cameras]
|
||||
image = np.vstack(image)
|
||||
return image
|
||||
162
rl_coach/environments/control_suite_environment.py
Normal file
162
rl_coach/environments/control_suite_environment.py
Normal file
@@ -0,0 +1,162 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
|
||||
|
||||
import random
|
||||
from enum import Enum
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
from dm_control import suite
|
||||
from dm_control.suite.wrappers import pixels
|
||||
except ImportError:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("DeepMind Control Suite")
|
||||
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
|
||||
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
|
||||
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, VectorObservationSpace, StateSpace
|
||||
|
||||
|
||||
class ObservationType(Enum):
|
||||
Measurements = 1
|
||||
Image = 2
|
||||
Image_and_Measurements = 3
|
||||
|
||||
|
||||
# Parameters
|
||||
class ControlSuiteEnvironmentParameters(EnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.observation_type = ObservationType.Measurements
|
||||
self.default_input_filter = ControlSuiteInputFilter
|
||||
self.default_output_filter = ControlSuiteOutputFilter
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.environments.control_suite_environment:ControlSuiteEnvironment'
|
||||
|
||||
|
||||
"""
|
||||
ControlSuite Environment Components
|
||||
"""
|
||||
ControlSuiteInputFilter = NoInputFilter()
|
||||
ControlSuiteOutputFilter = NoOutputFilter()
|
||||
|
||||
control_suite_envs = {':'.join(env): ':'.join(env) for env in suite.BENCHMARKING}
|
||||
|
||||
|
||||
# Environment
|
||||
class ControlSuiteEnvironment(Environment):
|
||||
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
|
||||
seed: Union[None, int]=None, human_control: bool=False,
|
||||
observation_type: ObservationType=ObservationType.Measurements,
|
||||
custom_reward_threshold: Union[int, float]=None, **kwargs):
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
|
||||
|
||||
self.observation_type = observation_type
|
||||
|
||||
# load and initialize environment
|
||||
domain_name, task_name = self.env_id.split(":")
|
||||
self.env = suite.load(domain_name=domain_name, task_name=task_name)
|
||||
|
||||
if observation_type != ObservationType.Measurements:
|
||||
self.env = pixels.Wrapper(self.env, pixels_only=observation_type == ObservationType.Image)
|
||||
|
||||
# seed
|
||||
if self.seed is not None:
|
||||
np.random.seed(self.seed)
|
||||
random.seed(self.seed)
|
||||
|
||||
self.state_space = StateSpace({})
|
||||
|
||||
# image observations
|
||||
if observation_type != ObservationType.Measurements:
|
||||
self.state_space['pixels'] = ImageObservationSpace(shape=self.env.observation_spec()['pixels'].shape,
|
||||
high=255)
|
||||
|
||||
# measurements observations
|
||||
if observation_type != ObservationType.Image:
|
||||
measurements_space_size = 0
|
||||
measurements_names = []
|
||||
for observation_space_name, observation_space in self.env.observation_spec().items():
|
||||
if len(observation_space.shape) == 0:
|
||||
measurements_space_size += 1
|
||||
measurements_names.append(observation_space_name)
|
||||
elif len(observation_space.shape) == 1:
|
||||
measurements_space_size += observation_space.shape[0]
|
||||
measurements_names.extend(["{}_{}".format(observation_space_name, i) for i in
|
||||
range(observation_space.shape[0])])
|
||||
self.state_space['measurements'] = VectorObservationSpace(shape=measurements_space_size,
|
||||
measurements_names=measurements_names)
|
||||
|
||||
# actions
|
||||
self.action_space = BoxActionSpace(
|
||||
shape=self.env.action_spec().shape[0],
|
||||
low=self.env.action_spec().minimum,
|
||||
high=self.env.action_spec().maximum
|
||||
)
|
||||
|
||||
# initialize the state by getting a new state from the environment
|
||||
self.reset_internal_state(True)
|
||||
|
||||
# render
|
||||
if self.is_rendered:
|
||||
image = self.get_rendered_image()
|
||||
scale = 1
|
||||
if self.human_control:
|
||||
scale = 2
|
||||
if not self.native_rendering:
|
||||
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
|
||||
|
||||
def _update_state(self):
|
||||
self.state = {}
|
||||
|
||||
if self.observation_type != ObservationType.Measurements:
|
||||
self.pixels = self.last_result.observation['pixels']
|
||||
self.state['pixels'] = self.pixels
|
||||
|
||||
if self.observation_type != ObservationType.Image:
|
||||
self.measurements = np.array([])
|
||||
for sub_observation in self.last_result.observation.values():
|
||||
if isinstance(sub_observation, np.ndarray) and len(sub_observation.shape) == 1:
|
||||
self.measurements = np.concatenate((self.measurements, sub_observation))
|
||||
else:
|
||||
self.measurements = np.concatenate((self.measurements, np.array([sub_observation])))
|
||||
self.state['measurements'] = self.measurements
|
||||
|
||||
self.reward = self.last_result.reward if self.last_result.reward is not None else 0
|
||||
|
||||
self.done = self.last_result.last()
|
||||
|
||||
def _take_action(self, action):
|
||||
if type(self.action_space) == BoxActionSpace:
|
||||
action = self.action_space.clip_action_to_space(action)
|
||||
|
||||
self.last_result = self.env.step(action)
|
||||
|
||||
def _restart_environment_episode(self, force_environment_reset=False):
|
||||
self.last_result = self.env.reset()
|
||||
|
||||
def _render(self):
|
||||
pass
|
||||
|
||||
def get_rendered_image(self):
|
||||
return self.env.physics.render(camera_id=0)
|
||||
39
rl_coach/environments/doom/D2_navigation.cfg
Normal file
39
rl_coach/environments/doom/D2_navigation.cfg
Normal file
@@ -0,0 +1,39 @@
|
||||
# Lines starting with # are treated as comments (or with whitespaces+#).
|
||||
# It doesn't matter if you use capital letters or not.
|
||||
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
|
||||
|
||||
doom_scenario_path = D2_navigation.wad
|
||||
doom_map = map01
|
||||
|
||||
# Rewards
|
||||
|
||||
# Each step is good for you!
|
||||
living_reward = 1
|
||||
# And death is not!
|
||||
death_penalty = 0
|
||||
|
||||
# Rendering options
|
||||
screen_resolution = RES_160X120
|
||||
screen_format = GRAY8
|
||||
render_hud = false
|
||||
render_crosshair = false
|
||||
render_weapon = false
|
||||
render_decals = false
|
||||
render_particles = false
|
||||
window_visible = false
|
||||
|
||||
# make episodes finish after 2100 actions (tics)
|
||||
episode_timeout = 2100
|
||||
|
||||
# Available buttons
|
||||
available_buttons =
|
||||
{
|
||||
TURN_LEFT
|
||||
TURN_RIGHT
|
||||
MOVE_FORWARD
|
||||
}
|
||||
|
||||
# Game variables that will be in the state
|
||||
available_game_variables = { HEALTH }
|
||||
|
||||
mode = PLAYER
|
||||
BIN
rl_coach/environments/doom/D2_navigation.wad
Normal file
BIN
rl_coach/environments/doom/D2_navigation.wad
Normal file
Binary file not shown.
44
rl_coach/environments/doom/D3_battle.cfg
Normal file
44
rl_coach/environments/doom/D3_battle.cfg
Normal file
@@ -0,0 +1,44 @@
|
||||
# Lines starting with # are treated as comments (or with whitespaces+#).
|
||||
# It doesn't matter if you use capital letters or not.
|
||||
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
|
||||
|
||||
# modify these to point to your vizdoom binary and freedoom2.wad
|
||||
doom_scenario_path = D3_battle.wad
|
||||
doom_map = map01
|
||||
|
||||
# Rewards
|
||||
|
||||
living_reward = 0
|
||||
death_penalty = 0
|
||||
|
||||
# Rendering options
|
||||
screen_resolution = RES_320X240
|
||||
screen_format = CRCGCB
|
||||
render_hud = false
|
||||
render_crosshair = true
|
||||
render_weapon = true
|
||||
render_decals = false
|
||||
render_particles = false
|
||||
window_visible = false
|
||||
|
||||
# make episodes finish after 2100 actions (tics)
|
||||
episode_timeout = 2100
|
||||
|
||||
# Available buttons
|
||||
available_buttons =
|
||||
{
|
||||
MOVE_FORWARD
|
||||
MOVE_BACKWARD
|
||||
MOVE_RIGHT
|
||||
MOVE_LEFT
|
||||
TURN_LEFT
|
||||
TURN_RIGHT
|
||||
ATTACK
|
||||
SPEED
|
||||
}
|
||||
|
||||
# Game variables that will be in the state
|
||||
available_game_variables = {AMMO2 HEALTH USER2}
|
||||
|
||||
mode = PLAYER
|
||||
doom_skill = 2
|
||||
BIN
rl_coach/environments/doom/D3_battle.wad
Normal file
BIN
rl_coach/environments/doom/D3_battle.wad
Normal file
Binary file not shown.
229
rl_coach/environments/doom_environment.py
Normal file
229
rl_coach/environments/doom_environment.py
Normal file
@@ -0,0 +1,229 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
try:
|
||||
import vizdoom
|
||||
except ImportError:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("ViZDoom")
|
||||
|
||||
import os
|
||||
from enum import Enum
|
||||
from os import path, environ
|
||||
from typing import Union, List
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
|
||||
from rl_coach.filters.action.full_discrete_action_space_map import FullDiscreteActionSpaceMap
|
||||
from rl_coach.filters.filter import InputFilter, OutputFilter
|
||||
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
|
||||
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
|
||||
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
|
||||
from rl_coach.spaces import MultiSelectActionSpace, ImageObservationSpace, \
|
||||
VectorObservationSpace, StateSpace
|
||||
|
||||
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
|
||||
|
||||
|
||||
# enum of the available levels and their path
|
||||
class DoomLevel(Enum):
|
||||
BASIC = "basic.cfg"
|
||||
DEFEND = "defend_the_center.cfg"
|
||||
DEATHMATCH = "deathmatch.cfg"
|
||||
MY_WAY_HOME = "my_way_home.cfg"
|
||||
TAKE_COVER = "take_cover.cfg"
|
||||
HEALTH_GATHERING = "health_gathering.cfg"
|
||||
HEALTH_GATHERING_SUPREME_COACH_LOCAL = "D2_navigation.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
|
||||
DEFEND_THE_LINE = "defend_the_line.cfg"
|
||||
DEADLY_CORRIDOR = "deadly_corridor.cfg"
|
||||
BATTLE_COACH_LOCAL = "D3_battle.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
|
||||
|
||||
key_map = {
|
||||
'NO-OP': 96, # `
|
||||
'ATTACK': 13, # enter
|
||||
'CROUCH': 306, # ctrl
|
||||
'DROP_SELECTED_ITEM': ord("t"),
|
||||
'DROP_SELECTED_WEAPON': ord("t"),
|
||||
'JUMP': 32, # spacebar
|
||||
'LAND': ord("l"),
|
||||
'LOOK_DOWN': 274, # down arrow
|
||||
'LOOK_UP': 273, # up arrow
|
||||
'MOVE_BACKWARD': ord("s"),
|
||||
'MOVE_DOWN': ord("s"),
|
||||
'MOVE_FORWARD': ord("w"),
|
||||
'MOVE_LEFT': 276,
|
||||
'MOVE_RIGHT': 275,
|
||||
'MOVE_UP': ord("w"),
|
||||
'RELOAD': ord("r"),
|
||||
'SELECT_NEXT_WEAPON': ord("q"),
|
||||
'SELECT_PREV_WEAPON': ord("e"),
|
||||
'SELECT_WEAPON0': ord("0"),
|
||||
'SELECT_WEAPON1': ord("1"),
|
||||
'SELECT_WEAPON2': ord("2"),
|
||||
'SELECT_WEAPON3': ord("3"),
|
||||
'SELECT_WEAPON4': ord("4"),
|
||||
'SELECT_WEAPON5': ord("5"),
|
||||
'SELECT_WEAPON6': ord("6"),
|
||||
'SELECT_WEAPON7': ord("7"),
|
||||
'SELECT_WEAPON8': ord("8"),
|
||||
'SELECT_WEAPON9': ord("9"),
|
||||
'SPEED': 304, # shift
|
||||
'STRAFE': 9, # tab
|
||||
'TURN180': ord("u"),
|
||||
'TURN_LEFT': ord("a"), # left arrow
|
||||
'TURN_RIGHT': ord("d"), # right arrow
|
||||
'USE': ord("f"),
|
||||
}
|
||||
|
||||
|
||||
DoomInputFilter = InputFilter(is_a_reference_filter=True)
|
||||
DoomInputFilter.add_observation_filter('observation', 'rescaling',
|
||||
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([60, 76, 3]),
|
||||
high=255)))
|
||||
DoomInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
|
||||
DoomInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
|
||||
DoomInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(3))
|
||||
|
||||
|
||||
DoomOutputFilter = OutputFilter(is_a_reference_filter=True)
|
||||
DoomOutputFilter.add_action_filter('to_discrete', FullDiscreteActionSpaceMap())
|
||||
|
||||
|
||||
class DoomEnvironmentParameters(EnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.default_input_filter = DoomInputFilter
|
||||
self.default_output_filter = DoomOutputFilter
|
||||
self.cameras = [DoomEnvironment.CameraTypes.OBSERVATION]
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.environments.doom_environment:DoomEnvironment'
|
||||
|
||||
|
||||
class DoomEnvironment(Environment):
|
||||
class CameraTypes(Enum):
|
||||
OBSERVATION = ("observation", "screen_buffer")
|
||||
DEPTH = ("depth", "depth_buffer")
|
||||
LABELS = ("labels", "labels_buffer")
|
||||
MAP = ("map", "automap_buffer")
|
||||
|
||||
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
|
||||
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
|
||||
cameras: List[CameraTypes], **kwargs):
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
|
||||
|
||||
self.cameras = cameras
|
||||
|
||||
# load the emulator with the required level
|
||||
self.level = DoomLevel[level.upper()]
|
||||
local_scenarios_path = path.join(os.path.dirname(os.path.realpath(__file__)), 'doom')
|
||||
self.scenarios_dir = local_scenarios_path if 'COACH_LOCAL' in level \
|
||||
else path.join(environ.get('VIZDOOM_ROOT'), 'scenarios')
|
||||
|
||||
self.game = vizdoom.DoomGame()
|
||||
self.game.load_config(path.join(self.scenarios_dir, self.level.value))
|
||||
self.game.set_window_visible(False)
|
||||
self.game.add_game_args("+vid_forcesurface 1")
|
||||
|
||||
self.wait_for_explicit_human_action = True
|
||||
if self.human_control:
|
||||
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_640X480)
|
||||
elif self.is_rendered:
|
||||
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_320X240)
|
||||
else:
|
||||
# lower resolution since we actually take only 76x60 and we don't need to render
|
||||
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_160X120)
|
||||
|
||||
self.game.set_render_hud(False)
|
||||
self.game.set_render_crosshair(False)
|
||||
self.game.set_render_decals(False)
|
||||
self.game.set_render_particles(False)
|
||||
for camera in self.cameras:
|
||||
if hasattr(self.game, 'set_{}_enabled'.format(camera.value[1])):
|
||||
getattr(self.game, 'set_{}_enabled'.format(camera.value[1]))(True)
|
||||
self.game.init()
|
||||
|
||||
# actions
|
||||
actions_description = ['NO-OP']
|
||||
actions_description += [str(action).split(".")[1] for action in self.game.get_available_buttons()]
|
||||
actions_description = actions_description[::-1]
|
||||
self.action_space = MultiSelectActionSpace(self.game.get_available_buttons_size(),
|
||||
max_simultaneous_selected_actions=1,
|
||||
descriptions=actions_description,
|
||||
allow_no_action_to_be_selected=True)
|
||||
|
||||
# human control
|
||||
if self.human_control:
|
||||
# TODO: add this to the action space
|
||||
# map keyboard keys to actions
|
||||
for idx, action in enumerate(self.action_space.descriptions):
|
||||
if action in key_map.keys():
|
||||
self.key_to_action[(key_map[action],)] = idx
|
||||
|
||||
# states
|
||||
self.state_space = StateSpace({
|
||||
"measurements": VectorObservationSpace(self.game.get_state().game_variables.shape[0],
|
||||
measurements_names=[str(m) for m in
|
||||
self.game.get_available_game_variables()])
|
||||
})
|
||||
for camera in self.cameras:
|
||||
self.state_space[camera.value[0]] = ImageObservationSpace(
|
||||
shape=np.array([self.game.get_screen_height(), self.game.get_screen_width(), 3]),
|
||||
high=255)
|
||||
|
||||
# seed
|
||||
if seed is not None:
|
||||
self.game.set_seed(seed)
|
||||
self.reset_internal_state()
|
||||
|
||||
# render
|
||||
if self.is_rendered:
|
||||
image = self.get_rendered_image()
|
||||
self.renderer.create_screen(image.shape[1], image.shape[0])
|
||||
|
||||
def _update_state(self):
|
||||
# extract all data from the current state
|
||||
state = self.game.get_state()
|
||||
if state is not None and state.screen_buffer is not None:
|
||||
self.measurements = state.game_variables
|
||||
self.state = {'measurements': self.measurements}
|
||||
for camera in self.cameras:
|
||||
observation = getattr(state, camera.value[1])
|
||||
if len(observation.shape) == 3:
|
||||
self.state[camera.value[0]] = np.transpose(observation, (1, 2, 0))
|
||||
elif len(observation.shape) == 2:
|
||||
self.state[camera.value[0]] = np.repeat(np.expand_dims(observation, -1), 3, axis=-1)
|
||||
|
||||
self.reward = self.game.get_last_reward()
|
||||
self.done = self.game.is_episode_finished()
|
||||
|
||||
def _take_action(self, action):
|
||||
self.game.make_action(list(action), self.frame_skip)
|
||||
|
||||
def _restart_environment_episode(self, force_environment_reset=False):
|
||||
self.game.new_episode()
|
||||
|
||||
def get_rendered_image(self) -> np.ndarray:
|
||||
"""
|
||||
Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco's observation is a measurements vector.
|
||||
:return: numpy array containing the image that will be rendered to the screen
|
||||
"""
|
||||
image = [self.state[camera.value[0]] for camera in self.cameras]
|
||||
image = np.vstack(image)
|
||||
return image
|
||||
540
rl_coach/environments/environment.py
Normal file
@@ -0,0 +1,540 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import operator
|
||||
import time
|
||||
from collections import OrderedDict
|
||||
from typing import Union, List, Tuple, Dict
|
||||
|
||||
import numpy as np
|
||||
from rl_coach.base_parameters import Parameters
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.core_types import GoalType, ActionType, EnvResponse, RunPhase
|
||||
from rl_coach.renderer import Renderer
|
||||
from rl_coach.spaces import ActionSpace, ObservationSpace, DiscreteActionSpace, RewardSpace, StateSpace
|
||||
from rl_coach.utils import squeeze_list, force_list
|
||||
|
||||
from rl_coach import logger
|
||||
from rl_coach.environments.environment_interface import EnvironmentInterface
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
class LevelSelection(object):
|
||||
def __init__(self, level: str):
|
||||
self.selected_level = level
|
||||
|
||||
def select(self, level: str):
|
||||
self.selected_level = level
|
||||
|
||||
def __str__(self):
|
||||
if self.selected_level is None:
|
||||
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
|
||||
"or change the level in the preset.", crash=True)
|
||||
return self.selected_level
|
||||
|
||||
|
||||
class SingleLevelSelection(LevelSelection):
|
||||
def __init__(self, levels: Union[str, List[str], Dict[str, str]]):
|
||||
super().__init__(None)
|
||||
self.levels = levels
|
||||
if isinstance(levels, list):
|
||||
self.levels = {level: level for level in levels}
|
||||
if isinstance(levels, str):
|
||||
self.levels = {levels: levels}
|
||||
|
||||
def __str__(self):
|
||||
if self.selected_level is None:
|
||||
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
|
||||
"or change the level in the preset. \nThe available levels are: \n{}"
|
||||
.format(', '.join(self.levels.keys())), crash=True)
|
||||
if self.selected_level not in self.levels.keys():
|
||||
logger.screen.error("The selected level ({}) is not part of the available levels ({})"
|
||||
.format(self.selected_level, ', '.join(self.levels.keys())), crash=True)
|
||||
return self.levels[self.selected_level]
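# A minimal usage sketch (level names are illustrative placeholders only):
#
#   levels = {'basic': 'basic.cfg', 'deathmatch': 'deathmatch.cfg'}
#   level_selection = SingleLevelSelection(levels)
#   level_selection.select('basic')   # e.g. driven by the -lvl command line flag
#   str(level_selection)              # -> 'basic.cfg'; errors out (crash=True) if nothing was selected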
|
||||
|
||||
|
||||
# class SingleLevelPerPhase(LevelSelection):
|
||||
# def __init__(self, levels: Dict[RunPhase, str]):
|
||||
# super().__init__(None)
|
||||
# self.levels = levels
|
||||
#
|
||||
# def __str__(self):
|
||||
# super().__str__()
|
||||
# if self.selected_level not in self.levels.keys():
|
||||
# logger.screen.error("The selected level ({}) is not part of the available levels ({})"
|
||||
# .format(self.selected_level, self.levels.keys()), crash=True)
|
||||
# return self.levels[self.selected_level]
|
||||
|
||||
|
||||
class CustomWrapper(object):
|
||||
def __init__(self, environment):
|
||||
super().__init__()
|
||||
self.environment = environment
|
||||
|
||||
def __getattr__(self, attr):
|
||||
if attr in self.__dict__:
|
||||
return self.__dict__[attr]
|
||||
else:
|
||||
return getattr(self.environment, attr, False)
|
||||
|
||||
|
||||
class EnvironmentParameters(Parameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.level = None
|
||||
self.frame_skip = 4
|
||||
self.seed = None
|
||||
self.human_control = False
|
||||
self.custom_reward_threshold = None
|
||||
self.default_input_filter = None
|
||||
self.default_output_filter = None
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.environments.environment:Environment'
|
||||
|
||||
|
||||
class Environment(EnvironmentInterface):
|
||||
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
|
||||
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
|
||||
**kwargs):
|
||||
"""
|
||||
:param level: The environment level. Each environment can have multiple levels
|
||||
:param seed: a seed for the random number generator of the environment
|
||||
:param frame_skip: number of frames to skip (while repeating the same action) between each two agent directives
|
||||
:param human_control: whether a human should control the environment
|
||||
:param visualization_parameters: a blob of parameters used for visualization of the environment
|
||||
:param **kwargs: as the class is instantiated by EnvironmentParameters, this is used to support having
|
||||
additional arguments which will be ignored by this class, but might be used by others
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
# env initialization
|
||||
|
||||
self.game = []
|
||||
|
||||
self.state = {}
|
||||
self.observation = None
|
||||
self.goal = None
|
||||
self.reward = 0
|
||||
self.done = False
|
||||
self.info = {}
|
||||
self._last_env_response = None
|
||||
self.last_action = 0
|
||||
self.episode_idx = 0
|
||||
self.total_steps_counter = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.last_episode_time = time.time()
|
||||
self.key_to_action = {}
|
||||
self.last_episode_images = []
|
||||
|
||||
# rewards
|
||||
self.total_reward_in_current_episode = 0
|
||||
self.max_reward_achieved = -np.inf
|
||||
self.reward_success_threshold = custom_reward_threshold
|
||||
|
||||
# spaces
|
||||
self.state_space = self._state_space = None
|
||||
self.goal_space = self._goal_space = None
|
||||
self.action_space = self._action_space = None
|
||||
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold) # TODO: add a getter and setter
|
||||
|
||||
self.env_id = str(level)
|
||||
self.seed = seed
|
||||
self.frame_skip = frame_skip
|
||||
|
||||
# human interaction and visualization
|
||||
self.human_control = human_control
|
||||
self.wait_for_explicit_human_action = False
|
||||
self.is_rendered = visualization_parameters.render or self.human_control
|
||||
self.native_rendering = visualization_parameters.native_rendering or self.human_control
|
||||
self.visualization_parameters = visualization_parameters
|
||||
if not self.native_rendering:
|
||||
self.renderer = Renderer()
|
||||
|
||||
@property
|
||||
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
|
||||
"""
|
||||
Get the action space of the environment
|
||||
:return: the action space
|
||||
"""
|
||||
return self._action_space
|
||||
|
||||
@action_space.setter
|
||||
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
|
||||
"""
|
||||
Set the action space of the environment
|
||||
:return: None
|
||||
"""
|
||||
self._action_space = val
|
||||
|
||||
@property
|
||||
def state_space(self) -> Union[List[StateSpace], StateSpace]:
|
||||
"""
|
||||
Get the state space of the environment
|
||||
:return: the observation space
|
||||
"""
|
||||
return self._state_space
|
||||
|
||||
@state_space.setter
|
||||
def state_space(self, val: Union[List[StateSpace], StateSpace]):
|
||||
"""
|
||||
Set the state space of the environment
|
||||
:return: None
|
||||
"""
|
||||
self._state_space = val
|
||||
|
||||
@property
|
||||
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
|
||||
"""
|
||||
Get the state space of the environment
|
||||
:return: the observation space
|
||||
"""
|
||||
return self._goal_space
|
||||
|
||||
@goal_space.setter
|
||||
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
|
||||
"""
|
||||
Set the goal space of the environment
|
||||
:return: None
|
||||
"""
|
||||
self._goal_space = val
|
||||
|
||||
def get_action_from_user(self) -> ActionType:
|
||||
"""
|
||||
Get an action from the user keyboard
|
||||
:return: action index
|
||||
"""
|
||||
if self.wait_for_explicit_human_action:
|
||||
while len(self.renderer.pressed_keys) == 0:
|
||||
self.renderer.get_events()
|
||||
|
||||
if self.key_to_action == {}:
|
||||
# the keys are the numbers on the keyboard corresponding to the action index
|
||||
if len(self.renderer.pressed_keys) > 0:
|
||||
action_idx = self.renderer.pressed_keys[0] - ord("1")
|
||||
if 0 <= action_idx < self.action_space.shape[0]:
|
||||
return action_idx
|
||||
else:
|
||||
# the keys are mapped through the environment to more intuitive keyboard keys
|
||||
# key = tuple(self.renderer.pressed_keys)
|
||||
# for key in self.renderer.pressed_keys:
|
||||
for env_keys in self.key_to_action.keys():
|
||||
if set(env_keys) == set(self.renderer.pressed_keys):
|
||||
return self.action_space.actions[self.key_to_action[env_keys]]
|
||||
|
||||
# return the default action 0 so that the environment will continue running
|
||||
return self.action_space.default_action
|
||||
|
||||
@property
|
||||
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
|
||||
"""
|
||||
Get the last environment response
|
||||
:return: a dictionary that contains the state, reward, etc.
|
||||
"""
|
||||
return squeeze_list(self._last_env_response)
|
||||
|
||||
@last_env_response.setter
|
||||
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
|
||||
"""
|
||||
Set the last environment response
|
||||
:param val: the last environment response
|
||||
"""
|
||||
self._last_env_response = force_list(val)
|
||||
|
||||
def step(self, action: ActionType) -> EnvResponse:
|
||||
"""
|
||||
Make a single step in the environment using the given action
|
||||
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
|
||||
:return: the environment response as returned in get_last_env_response
|
||||
"""
|
||||
action = self.action_space.clip_action_to_space(action)
|
||||
if self.action_space and not self.action_space.val_matches_space_definition(action):
|
||||
raise ValueError("The given action does not match the action space definition. "
|
||||
"Action = {}, action space definition = {}".format(action, self.action_space))
|
||||
|
||||
# store the last agent action done and allow passing None actions to repeat the previously done action
|
||||
if action is None:
|
||||
action = self.last_action
|
||||
self.last_action = action
|
||||
if self.visualization_parameters.add_rendered_image_to_env_response:
|
||||
current_rendered_image = self.get_rendered_image()
|
||||
|
||||
self.current_episode_steps_counter += 1
|
||||
if self.phase != RunPhase.UNDEFINED:
|
||||
self.total_steps_counter += 1
|
||||
|
||||
# act
|
||||
self._take_action(action)
|
||||
|
||||
# observe
|
||||
self._update_state()
|
||||
|
||||
if self.is_rendered:
|
||||
self.render()
|
||||
|
||||
self.total_reward_in_current_episode += self.reward
|
||||
|
||||
if self.visualization_parameters.add_rendered_image_to_env_response:
|
||||
self.info['image'] = current_rendered_image
|
||||
|
||||
self.last_env_response = \
|
||||
EnvResponse(
|
||||
reward=self.reward,
|
||||
next_state=self.state,
|
||||
goal=self.goal,
|
||||
game_over=self.done,
|
||||
info=self.info
|
||||
)
|
||||
|
||||
# store observations for video / gif dumping
|
||||
if self.should_dump_video_of_the_current_episode(episode_terminated=False) and \
|
||||
(self.visualization_parameters.dump_mp4 or self.visualization_parameters.dump_gifs):
|
||||
self.last_episode_images.append(self.get_rendered_image())
|
||||
|
||||
return self.last_env_response
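# A rough interaction sketch, assuming a concrete Environment subclass (e.g. the Doom or gym
# environments defined in this package) and default visualization parameters:
#
#   env_response = env.reset_internal_state()
#   while not env_response.game_over:
#       action = env.get_random_action()
#       env_response = env.step(action)   # an EnvResponse with next_state, reward, game_over, ...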
|
||||
|
||||
def render(self) -> None:
|
||||
"""
|
||||
Call the environment function for rendering to the screen
|
||||
"""
|
||||
if self.native_rendering:
|
||||
self._render()
|
||||
else:
|
||||
self.renderer.render_image(self.get_rendered_image())
|
||||
|
||||
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
|
||||
"""
|
||||
Reset the environment and all the variables of the wrapper
|
||||
:param force_environment_reset: forces environment reset even when the game did not end
|
||||
:return: the environment response (an EnvResponse) containing the initial state, reward, etc.
|
||||
"""
|
||||
|
||||
self.dump_video_of_last_episode_if_needed()
|
||||
self._restart_environment_episode(force_environment_reset)
|
||||
self.last_episode_time = time.time()
|
||||
|
||||
if self.current_episode_steps_counter > 0 and self.phase != RunPhase.UNDEFINED:
|
||||
self.episode_idx += 1
|
||||
|
||||
self.done = False
|
||||
self.total_reward_in_current_episode = self.reward = 0.0
|
||||
self.last_action = 0
|
||||
self.current_episode_steps_counter = 0
|
||||
self.last_episode_images = []
|
||||
self._update_state()
|
||||
|
||||
# render before the preprocessing of the observation, so that the image will be in its original quality
|
||||
if self.is_rendered:
|
||||
self.render()
|
||||
|
||||
self.last_env_response = \
|
||||
EnvResponse(
|
||||
reward=self.reward,
|
||||
next_state=self.state,
|
||||
goal=self.goal,
|
||||
game_over=self.done,
|
||||
info=self.info
|
||||
)
|
||||
|
||||
return self.last_env_response
|
||||
|
||||
def get_random_action(self) -> ActionType:
|
||||
"""
|
||||
Returns an action picked uniformly from the available actions
|
||||
:return: a numpy array with a random action
|
||||
"""
|
||||
return self.action_space.sample()
|
||||
|
||||
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
|
||||
"""
|
||||
Return a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
:return: a list of tuples mapping between action names and the keyboard key that triggers them
|
||||
"""
|
||||
available_keys = []
|
||||
if self.key_to_action != {}:
|
||||
for key, idx in sorted(self.key_to_action.items(), key=operator.itemgetter(1)):
|
||||
if key != ():
|
||||
key_names = [self.renderer.get_key_names([k])[0] for k in key]
|
||||
available_keys.append((self.action_space.descriptions[idx], ' + '.join(key_names)))
|
||||
elif type(self.action_space) == DiscreteActionSpace:
|
||||
for action in range(self.action_space.shape):
|
||||
available_keys.append(("Action {}".format(action + 1), action + 1))
|
||||
return available_keys
|
||||
|
||||
def get_goal(self) -> GoalType:
|
||||
"""
|
||||
Get the current goal that the agent needs to achieve in the environment
|
||||
:return: The goal
|
||||
"""
|
||||
return self.goal
|
||||
|
||||
def set_goal(self, goal: GoalType) -> None:
|
||||
"""
|
||||
Set the current goal that the agent needs to achieve in the environment
|
||||
:param goal: the goal that needs to be achieved
|
||||
:return: None
|
||||
"""
|
||||
self.goal = goal
|
||||
|
||||
def should_dump_video_of_the_current_episode(self, episode_terminated=False):
|
||||
if self.visualization_parameters.video_dump_methods:
|
||||
for video_dump_method in force_list(self.visualization_parameters.video_dump_methods):
|
||||
if not video_dump_method.should_dump(episode_terminated, **self.__dict__):
|
||||
return False
|
||||
return True
|
||||
return False
|
||||
|
||||
def dump_video_of_last_episode_if_needed(self):
|
||||
if self.visualization_parameters.video_dump_methods and self.last_episode_images != []:
|
||||
if self.should_dump_video_of_the_current_episode(episode_terminated=True):
|
||||
self.dump_video_of_last_episode()
|
||||
|
||||
def dump_video_of_last_episode(self):
|
||||
frame_skipping = max(1, int(5 / self.frame_skip))
|
||||
file_name = 'episode-{}_score-{}'.format(self.episode_idx, self.total_reward_in_current_episode)
|
||||
fps = 10
|
||||
if self.visualization_parameters.dump_gifs:
|
||||
logger.create_gif(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
|
||||
if self.visualization_parameters.dump_mp4:
|
||||
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
|
||||
|
||||
def log_to_screen(self):
|
||||
# log to screen
|
||||
log = OrderedDict()
|
||||
log["Episode"] = self.episode_idx
|
||||
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
|
||||
log["Steps"] = self.total_steps_counter
|
||||
screen.log_dict(log, prefix=self.phase.value)
|
||||
|
||||
# The following functions define the interaction with the environment.
|
||||
# Any new environment that inherits the Environment class should use these signatures.
|
||||
# Some of these functions are optional - please read their description for more details.
|
||||
|
||||
def _take_action(self, action_idx: ActionType) -> None:
|
||||
"""
|
||||
An environment dependent function that sends an action to the simulator.
|
||||
:param action_idx: the action to perform on the environment
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def _update_state(self) -> None:
|
||||
"""
|
||||
Updates the state from the environment.
|
||||
Should update self.observation, self.reward, self.done, self.measurements and self.info
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def _restart_environment_episode(self, force_environment_reset=False) -> None:
|
||||
"""
|
||||
Restarts the simulator episode
|
||||
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
|
||||
:return: None
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def _render(self) -> None:
|
||||
"""
|
||||
Renders the environment using the native simulator renderer
|
||||
:return: None
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_rendered_image(self) -> np.ndarray:
|
||||
"""
|
||||
Return a numpy array containing the image that will be rendered to the screen.
|
||||
This can be different from the observation. For example, mujoco's observation is a measurements vector.
|
||||
:return: numpy array containing the image that will be rendered to the screen
|
||||
"""
|
||||
return np.transpose(self.state['observation'], [1, 2, 0])
|
||||
|
||||
|
||||
"""
|
||||
Video Dumping Methods
|
||||
"""
|
||||
|
||||
|
||||
class VideoDumpMethod(object):
|
||||
"""
|
||||
Method used to decide when to dump videos
|
||||
"""
|
||||
def should_dump(self, episode_terminated=False, **kwargs):
|
||||
raise NotImplementedError("")
|
||||
|
||||
|
||||
class AlwaysDumpMethod(VideoDumpMethod):
|
||||
"""
|
||||
Dump video for every episode
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def should_dump(self, episode_terminated=False, **kwargs):
|
||||
return True
|
||||
|
||||
|
||||
class MaxDumpMethod(VideoDumpMethod):
|
||||
"""
|
||||
Dump video every time a new max total reward has been achieved
|
||||
"""
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.max_reward_achieved = -np.inf
|
||||
|
||||
def should_dump(self, episode_terminated=False, **kwargs):
|
||||
# if the episode has not finished yet we want to be prepared for dumping a video
|
||||
if not episode_terminated:
|
||||
return True
|
||||
if kwargs['total_reward_in_current_episode'] > self.max_reward_achieved:
|
||||
self.max_reward_achieved = kwargs['total_reward_in_current_episode']
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
class EveryNEpisodesDumpMethod(VideoDumpMethod):
|
||||
"""
|
||||
Dump videos once in every N episodes
|
||||
"""
|
||||
def __init__(self, num_episodes_between_dumps: int):
|
||||
super().__init__()
|
||||
self.num_episodes_between_dumps = num_episodes_between_dumps
|
||||
self.last_dumped_episode = 0
|
||||
if num_episodes_between_dumps < 1:
|
||||
raise ValueError("the number of episodes between dumps should be a positive number")
|
||||
|
||||
def should_dump(self, episode_terminated=False, **kwargs):
|
||||
if kwargs['episode_idx'] >= self.last_dumped_episode + self.num_episodes_between_dumps - 1:
|
||||
self.last_dumped_episode = kwargs['episode_idx']
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
class SelectedPhaseOnlyDumpMethod(VideoDumpMethod):
|
||||
"""
|
||||
Dump videos when the phase of the environment matches a predefined phase
|
||||
"""
|
||||
def __init__(self, run_phases: Union[RunPhase, List[RunPhase]]):
|
||||
self.run_phases = force_list(run_phases)
|
||||
|
||||
def should_dump(self, episode_terminated=False, **kwargs):
|
||||
if kwargs['_phase'] in self.run_phases:
|
||||
return True
|
||||
else:
|
||||
return False
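# A hedged configuration sketch: these dump methods are meant to be plugged into
# VisualizationParameters.video_dump_methods (see should_dump_video_of_the_current_episode above),
# assuming VisualizationParameters exposes the attributes used there. For example, to dump gifs
# only for new best evaluation episodes:
#
#   vis_params = VisualizationParameters()
#   vis_params.dump_gifs = True
#   vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]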
|
||||
149
rl_coach/environments/environment_group.py
Normal file
@@ -0,0 +1,149 @@
|
||||
|
||||
########################################################################################################################
|
||||
####### Currently we are ignoring more complex cases including EnvironmentGroups - DO NOT USE THIS FILE ****************
|
||||
########################################################################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
# #
|
||||
# # Copyright (c) 2017 Intel Corporation
|
||||
# #
|
||||
# # Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# # you may not use this file except in compliance with the License.
|
||||
# # You may obtain a copy of the License at
|
||||
# #
|
||||
# # http://www.apache.org/licenses/LICENSE-2.0
|
||||
# #
|
||||
# # Unless required by applicable law or agreed to in writing, software
|
||||
# # distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# # See the License for the specific language governing permissions and
|
||||
# # limitations under the License.
|
||||
# #
|
||||
#
|
||||
# from typing import Union, List, Dict
|
||||
# import numpy as np
|
||||
# from environments import create_environment
|
||||
# from environments.environment import Environment
|
||||
# from environments.environment_interface import EnvironmentInterface, ActionType, ActionSpace
|
||||
# from core_types import GoalType, Transition
|
||||
#
|
||||
#
|
||||
# class EnvironmentGroup(EnvironmentInterface):
|
||||
# """
|
||||
# An EnvironmentGroup is a group of different environments.
|
||||
# In the simple case, it will contain a single environment. But it can also contain multiple environments,
|
||||
# where the agent can then act on them as a batch, such that the prediction of the action is more efficient.
|
||||
# """
|
||||
# def __init__(self, environments_parameters: List[Environment]):
|
||||
# self.environments_parameters = environments_parameters
|
||||
# self.environments = []
|
||||
# self.action_space = []
|
||||
# self.outgoing_control = []
|
||||
# self._last_env_response = []
|
||||
#
|
||||
# @property
|
||||
# def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
|
||||
# """
|
||||
# Get the action space of the environment
|
||||
# :return: the action space
|
||||
# """
|
||||
# return self.action_space
|
||||
#
|
||||
# @action_space.setter
|
||||
# def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
|
||||
# """
|
||||
# Set the action space of the environment
|
||||
# :return: None
|
||||
# """
|
||||
# self.action_space = val
|
||||
#
|
||||
# @property
|
||||
# def phase(self) -> RunPhase:
|
||||
# """
|
||||
# Get the phase of the environments group
|
||||
# :return: the current phase
|
||||
# """
|
||||
# return self.phase
|
||||
#
|
||||
# @phase.setter
|
||||
# def phase(self, val: RunPhase):
|
||||
# """
|
||||
# Change the phase of each one of the environments in the group
|
||||
# :param val: the new phase
|
||||
# :return: None
|
||||
# """
|
||||
# self.phase = val
|
||||
# call_method_for_all(self.environments, 'phase', val)
|
||||
#
|
||||
# def _create_environments(self):
|
||||
# """
|
||||
# Create the environments using the given parameters and update the environments list
|
||||
# :return: None
|
||||
# """
|
||||
# for environment_parameters in self.environments_parameters:
|
||||
# environment = create_environment(environment_parameters)
|
||||
# self.action_space = self.action_space.append(environment.action_space)
|
||||
# self.environments.append(environment)
|
||||
#
|
||||
# @property
|
||||
# def last_env_response(self) -> Union[List[Transition], Transition]:
|
||||
# """
|
||||
# Get the last environment response
|
||||
# :return: a dictionary that contains the state, reward, etc.
|
||||
# """
|
||||
# return squeeze_list(self._last_env_response)
|
||||
#
|
||||
# @last_env_response.setter
|
||||
# def last_env_response(self, val: Union[List[Transition], Transition]):
|
||||
# """
|
||||
# Set the last environment response
|
||||
# :param val: the last environment response
|
||||
# """
|
||||
# self._last_env_response = force_list(val)
|
||||
#
|
||||
# def step(self, actions: Union[List[ActionType], ActionType]) -> List[Transition]:
|
||||
# """
|
||||
# Act in all the environments in the group.
|
||||
# :param actions: can be either a single action if there is a single environment in the group, or a list of
|
||||
# actions in case there are multiple environments in the group. Each action can be an action index
|
||||
# or a numpy array representing a continuous action for example.
|
||||
# :return: The responses from all the environments in the group
|
||||
# """
|
||||
#
|
||||
# actions = force_list(actions)
|
||||
# if len(actions) != len(self.environments):
|
||||
# raise ValueError("The number of actions does not match the number of environments in the group")
|
||||
#
|
||||
# result = []
|
||||
# for environment, action in zip(self.environments, actions):
|
||||
# result.append(environment.step(action))
|
||||
#
|
||||
# self.last_env_response = result
|
||||
#
|
||||
# return result
|
||||
#
|
||||
# def reset(self, force_environment_reset: bool=False) -> List[Transition]:
|
||||
# """
|
||||
# Reset all the environments in the group
|
||||
# :param force_environment_reset: force the reset of each one of the environments
|
||||
# :return: a list of the environments responses
|
||||
# """
|
||||
# return call_method_for_all(self.environments, 'reset', force_environment_reset)
|
||||
#
|
||||
# def get_random_action(self) -> List[ActionType]:
|
||||
# """
|
||||
# Get a list of random action that can be applied on the environments in the group
|
||||
# :return: a list of random actions
|
||||
# """
|
||||
# return call_method_for_all(self.environments, 'get_random_action')
|
||||
#
|
||||
# def set_goal(self, goal: GoalType) -> None:
|
||||
# """
|
||||
# Set the goal of each one of the environments in the group to be the given goal
|
||||
# :param goal: a goal vector
|
||||
# :return: None
|
||||
# """
|
||||
# # TODO: maybe enable setting multiple goals?
|
||||
# call_method_for_all(self.environments, 'set_goal', goal)
|
||||
76
rl_coach/environments/environment_interface.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from typing import Union, Dict
|
||||
|
||||
from rl_coach.spaces import ActionSpace
|
||||
|
||||
from rl_coach.core_types import ActionType, EnvResponse, RunPhase
|
||||
|
||||
|
||||
class EnvironmentInterface(object):
|
||||
def __init__(self):
|
||||
self._phase = RunPhase.UNDEFINED
|
||||
|
||||
@property
|
||||
def phase(self) -> RunPhase:
|
||||
"""
|
||||
Get the phase of the environment
|
||||
:return: the current phase
|
||||
"""
|
||||
return self._phase
|
||||
|
||||
@phase.setter
|
||||
def phase(self, val: RunPhase):
|
||||
"""
|
||||
Change the phase of the environment
|
||||
:param val: the new phase
|
||||
:return: None
|
||||
"""
|
||||
self._phase = val
|
||||
|
||||
@property
|
||||
def action_space(self) -> Union[Dict[str, ActionSpace], ActionSpace]:
|
||||
"""
|
||||
Get the action space of the environment (or of each of the agents wrapped in this
environment, i.e. in the LevelManager case)
|
||||
:return: the action space
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def get_random_action(self) -> ActionType:
|
||||
"""
|
||||
Get a random action from the environment action space
|
||||
:return: An action that follows the definition of the action space.
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def step(self, action: ActionType) -> Union[None, EnvResponse]:
|
||||
"""
|
||||
Make a single step in the environment using the given action
|
||||
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
|
||||
:return: the environment response as returned in get_last_env_response or None for LevelManager
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
|
||||
def reset_internal_state(self, force_environment_reset: bool=False) -> Union[None, EnvResponse]:
|
||||
"""
|
||||
Reset the environment episode
|
||||
:param force_environment_reset: in some cases, resetting the environment can be suppressed by the environment
itself. This flag allows forcing the reset.
|
||||
:return: the environment response as returned in get_last_env_response or None for LevelManager
|
||||
"""
|
||||
raise NotImplementedError("")
|
||||
454
rl_coach/environments/gym_environment.py
Normal file
@@ -0,0 +1,454 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
import scipy.ndimage
import scipy.misc  # used below in _get_robotics_image via scipy.misc.imresize
|
||||
|
||||
from rl_coach.utils import lower_under_to_upper, short_dynamic_import
|
||||
|
||||
try:
|
||||
import roboschool
|
||||
from OpenGL import GL
|
||||
except ImportError:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("RoboSchool")
|
||||
|
||||
try:
|
||||
from rl_coach.gym_extensions.continuous import mujoco
|
||||
except:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("GymExtensions")
|
||||
|
||||
try:
|
||||
import pybullet_envs
|
||||
except ImportError:
|
||||
from rl_coach.logger import failed_imports
|
||||
failed_imports.append("PyBullet")
|
||||
|
||||
from typing import Dict, Any, Union
|
||||
from rl_coach.core_types import RunPhase
|
||||
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
|
||||
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, ImageObservationSpace, VectorObservationSpace, \
|
||||
StateSpace, RewardSpace
|
||||
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
|
||||
from rl_coach.filters.reward.reward_clipping_filter import RewardClippingFilter
|
||||
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
|
||||
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
|
||||
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
|
||||
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
|
||||
from rl_coach.filters.filter import InputFilter
|
||||
import random
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.logger import screen
|
||||
|
||||
|
||||
# Parameters
|
||||
|
||||
class GymEnvironmentParameters(EnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.random_initialization_steps = 0
|
||||
self.max_over_num_frames = 1
|
||||
self.additional_simulator_parameters = None
|
||||
|
||||
@property
|
||||
def path(self):
|
||||
return 'rl_coach.environments.gym_environment:GymEnvironment'
|
||||
|
||||
|
||||
"""
|
||||
Roboschool Environment Components
|
||||
"""
|
||||
RoboSchoolInputFilters = NoInputFilter()
|
||||
RoboSchoolOutputFilters = NoOutputFilter()
|
||||
|
||||
|
||||
class Roboschool(GymEnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame_skip = 1
|
||||
self.default_input_filter = RoboSchoolInputFilters
|
||||
self.default_output_filter = RoboSchoolOutputFilters
|
||||
|
||||
|
||||
gym_roboschool_envs = ['inverted_pendulum', 'inverted_pendulum_swingup', 'inverted_double_pendulum', 'reacher',
|
||||
'hopper', 'walker2d', 'half_cheetah', 'ant', 'humanoid', 'humanoid_flagrun',
|
||||
'humanoid_flagrun_harder', 'pong']
|
||||
roboschool_v0 = {e: "{}".format(lower_under_to_upper(e) + '-v0') for e in gym_roboschool_envs}
|
||||
|
||||
"""
|
||||
Mujoco Environment Components
|
||||
"""
|
||||
MujocoInputFilter = NoInputFilter()
|
||||
MujocoOutputFilter = NoOutputFilter()
|
||||
|
||||
|
||||
class Mujoco(GymEnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame_skip = 1
|
||||
self.default_input_filter = MujocoInputFilter
|
||||
self.default_output_filter = MujocoOutputFilter
|
||||
|
||||
|
||||
gym_mujoco_envs = ['inverted_pendulum', 'inverted_double_pendulum', 'reacher', 'hopper', 'walker2d', 'half_cheetah',
|
||||
'ant', 'swimmer', 'humanoid', 'humanoid_standup', 'pusher', 'thrower', 'striker']
|
||||
|
||||
mujoco_v2 = {e: "{}".format(lower_under_to_upper(e) + '-v2') for e in gym_mujoco_envs}
|
||||
mujoco_v2['walker2d'] = 'Walker2d-v2'
|
||||
|
||||
gym_fetch_envs = ['reach', 'slide', 'push', 'pick_and_place']
|
||||
fetch_v1 = {e: "{}".format('Fetch' + lower_under_to_upper(e) + '-v1') for e in gym_fetch_envs}
|
||||
|
||||
"""
|
||||
Bullet Environment Components
|
||||
"""
|
||||
BulletInputFilter = NoInputFilter()
|
||||
BulletOutputFilter = NoOutputFilter()
|
||||
|
||||
|
||||
class Bullet(GymEnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame_skip = 1
|
||||
self.default_input_filter = BulletInputFilter
|
||||
self.default_output_filter = BulletOutputFilter
|
||||
|
||||
|
||||
"""
|
||||
Atari Environment Components
|
||||
"""
|
||||
|
||||
AtariInputFilter = InputFilter(is_a_reference_filter=True)
|
||||
AtariInputFilter.add_reward_filter('clipping', RewardClippingFilter(-1.0, 1.0))
|
||||
AtariInputFilter.add_observation_filter('observation', 'rescaling',
|
||||
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
|
||||
high=255)))
|
||||
AtariInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
|
||||
AtariInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
|
||||
AtariInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))
|
||||
AtariOutputFilter = NoOutputFilter()
|
||||
|
||||
|
||||
class Atari(GymEnvironmentParameters):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.frame_skip = 4
|
||||
self.max_over_num_frames = 2
|
||||
self.random_initialization_steps = 30
|
||||
self.default_input_filter = AtariInputFilter
|
||||
self.default_output_filter = AtariOutputFilter
|
||||
|
||||
|
||||
gym_atari_envs = ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis',
|
||||
'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival',
|
||||
'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk',
|
||||
'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar',
|
||||
'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master',
|
||||
'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan',
|
||||
'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing',
|
||||
'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down',
|
||||
'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']
|
||||
atari_deterministic_v4 = {e: "{}".format(lower_under_to_upper(e) + 'Deterministic-v4') for e in gym_atari_envs}
|
||||
atari_no_frameskip_v4 = {e: "{}".format(lower_under_to_upper(e) + 'NoFrameskip-v4') for e in gym_atari_envs}
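# A hedged preset-style sketch combining the Atari parameters and the level dictionaries above
# (SingleLevelSelection is defined in rl_coach.environments.environment):
#
#   from rl_coach.environments.environment import SingleLevelSelection
#   env_params = Atari()
#   env_params.level = SingleLevelSelection(atari_deterministic_v4)
#   # the concrete game (e.g. 'breakout') is then selected at runtime, typically via the -lvl flag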
|
||||
|
||||
|
||||
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
|
||||
def __init__(self, env, frameskip=4, max_over_num_frames=2):
|
||||
super().__init__(env)
|
||||
self.max_over_num_frames = max_over_num_frames
|
||||
self.observations_stack = []
|
||||
self.frameskip = frameskip
|
||||
self.first_frame_to_max_over = self.frameskip - self.max_over_num_frames
|
||||
|
||||
def reset(self):
|
||||
return self.env.reset()
|
||||
|
||||
def step(self, action):
|
||||
total_reward = 0.0
|
||||
done = None
|
||||
info = None
|
||||
self.observations_stack = []
|
||||
for i in range(self.frameskip):
|
||||
observation, reward, done, info = self.env.step(action)
|
||||
if i >= self.first_frame_to_max_over:
|
||||
self.observations_stack.append(observation)
|
||||
total_reward += reward
|
||||
if done:
|
||||
# deal with last state in episode
|
||||
if not self.observations_stack:
|
||||
self.observations_stack.append(observation)
|
||||
break
|
||||
|
||||
max_over_frames_observation = np.max(self.observations_stack, axis=0)
|
||||
|
||||
return max_over_frames_observation, total_reward, done, info
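# A minimal sketch of using the wrapper directly on a gym environment (assumes the Atari ROMs are
# installed and that the wrapped env's own frameskip is 1, as enforced below in GymEnvironment):
#
#   env = gym.make('BreakoutNoFrameskip-v4')
#   env = MaxOverFramesAndFrameskipEnvWrapper(env, frameskip=4, max_over_num_frames=2)
#   observation = env.reset()
#   observation, reward, done, info = env.step(env.action_space.sample())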
|
||||
|
||||
|
||||
# Environment
|
||||
class GymEnvironment(Environment):
|
||||
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
|
||||
additional_simulator_parameters: Dict[str, Any] = None, seed: Union[None, int]=None,
|
||||
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
|
||||
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
|
||||
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
|
||||
visualization_parameters)
|
||||
|
||||
self.random_initialization_steps = random_initialization_steps
|
||||
self.max_over_num_frames = max_over_num_frames
|
||||
self.additional_simulator_parameters = additional_simulator_parameters
|
||||
|
||||
# hide warnings
|
||||
gym.logger.set_level(40)
|
||||
|
||||
"""
|
||||
load and initialize environment
|
||||
environment ids can be defined in 3 ways:
|
||||
1. Native gym environments like BreakoutDeterministic-v0 for example
|
||||
2. Custom gym environments written and installed as python packages.
|
||||
These environments should have a python module with a class inheriting gym.Env, implementing the
|
||||
relevant functions (_reset, _step, _render) and defining the observation and action space
|
||||
For example: my_environment_package:MyEnvironmentClass will run an environment defined in the
|
||||
MyEnvironmentClass class
|
||||
3. Custom gym environments written as an independent module which is not installed.
|
||||
These environments should have a python module with a class inheriting gym.Env, implementing the
|
||||
relevant functions (_reset, _step, _render) and defining the observation and action space.
|
||||
For example: path_to_my_environment.sub_directory.my_module:MyEnvironmentClass will run an
|
||||
environment defined in the MyEnvironmentClass class which is located in the module in the relative path
|
||||
path_to_my_environment.sub_directory.my_module
|
||||
"""
|
||||
if ':' in self.env_id:
|
||||
# custom environments
|
||||
if '/' in self.env_id or '.' in self.env_id:
|
||||
# environment in an absolute path module written as a unix path or in a relative path module
|
||||
# written as a python import path
|
||||
env_class = short_dynamic_import(self.env_id)
|
||||
else:
|
||||
# environment in a python package
|
||||
env_class = gym.envs.registration.load(self.env_id)
|
||||
|
||||
# instantiate the environment
|
||||
if self.additional_simulator_parameters:
|
||||
self.env = env_class(**self.additional_simulator_parameters)
|
||||
else:
|
||||
self.env = env_class()
|
||||
else:
|
||||
self.env = gym.make(self.env_id)
|
||||
|
||||
# for classic control we want to use the native renderer because otherwise we will get 2 renderer windows
|
||||
environment_to_always_use_with_native_rendering = ['classic_control', 'mujoco', 'robotics']
|
||||
self.native_rendering = self.native_rendering or \
|
||||
any([env in str(self.env.unwrapped.__class__)
|
||||
for env in environment_to_always_use_with_native_rendering])
|
||||
if self.native_rendering:
|
||||
if hasattr(self, 'renderer'):
|
||||
self.renderer.close()
|
||||
|
||||
# seed
|
||||
if self.seed is not None:
|
||||
self.env.seed(self.seed)
|
||||
np.random.seed(self.seed)
|
||||
random.seed(self.seed)
|
||||
|
||||
# frame skip and max between consecutive frames
|
||||
self.is_robotics_env = 'robotics' in str(self.env.unwrapped.__class__)
|
||||
self.is_mujoco_env = 'mujoco' in str(self.env.unwrapped.__class__)
|
||||
self.is_atari_env = 'Atari' in str(self.env.unwrapped.__class__)
|
||||
self.timelimit_env_wrapper = self.env
|
||||
if self.is_atari_env:
|
||||
self.env.unwrapped.frameskip = 1 # this accesses the atari env that is wrapped with a timelimit wrapper env
|
||||
if self.env_id == "SpaceInvadersDeterministic-v4" and self.frame_skip == 4:
|
||||
screen.warning("Warning: The frame-skip for Space Invaders was automatically updated from 4 to 3. "
|
||||
"This is following the DQN paper where it was noticed that a frame-skip of 3 makes the "
|
||||
"laser rays disappear. To force frame-skip of 4, please use SpaceInvadersNoFrameskip-v4.")
|
||||
self.frame_skip = 3
|
||||
self.env = MaxOverFramesAndFrameskipEnvWrapper(self.env,
|
||||
frameskip=self.frame_skip,
|
||||
max_over_num_frames=self.max_over_num_frames)
|
||||
else:
|
||||
self.env.unwrapped.frameskip = self.frame_skip
|
||||
|
||||
self.state_space = StateSpace({})
|
||||
|
||||
# observations
|
||||
if not isinstance(self.env.observation_space, gym.spaces.dict_space.Dict):
|
||||
state_space = {'observation': self.env.observation_space}
|
||||
else:
|
||||
state_space = self.env.observation_space.spaces
|
||||
|
||||
for observation_space_name, observation_space in state_space.items():
|
||||
if len(observation_space.shape) == 3 and observation_space.shape[-1] == 3:
|
||||
# we assume gym has image observations which are RGB and where their values are within 0-255
|
||||
self.state_space[observation_space_name] = ImageObservationSpace(
|
||||
shape=np.array(observation_space.shape),
|
||||
high=255,
|
||||
channels_axis=-1
|
||||
)
|
||||
else:
|
||||
self.state_space[observation_space_name] = VectorObservationSpace(
|
||||
shape=observation_space.shape[0],
|
||||
low=observation_space.low,
|
||||
high=observation_space.high
|
||||
)
|
||||
if 'desired_goal' in state_space.keys():
|
||||
self.goal_space = self.state_space['desired_goal']
|
||||
|
||||
# actions
|
||||
if type(self.env.action_space) == gym.spaces.box.Box:
|
||||
self.action_space = BoxActionSpace(
|
||||
shape=self.env.action_space.shape,
|
||||
low=self.env.action_space.low,
|
||||
high=self.env.action_space.high
|
||||
)
|
||||
elif type(self.env.action_space) == gym.spaces.discrete.Discrete:
|
||||
actions_description = []
|
||||
if hasattr(self.env.unwrapped, 'get_action_meanings'):
|
||||
actions_description = self.env.unwrapped.get_action_meanings()
|
||||
self.action_space = DiscreteActionSpace(
|
||||
num_actions=self.env.action_space.n,
|
||||
descriptions=actions_description
|
||||
)
|
||||
|
||||
if self.human_control:
|
||||
# TODO: add this to the action space
|
||||
# map keyboard keys to actions
|
||||
self.key_to_action = {}
|
||||
if hasattr(self.env.unwrapped, 'get_keys_to_action'):
|
||||
self.key_to_action = self.env.unwrapped.get_keys_to_action()
|
||||
|
||||
# initialize the state by getting a new state from the environment
|
||||
self.reset_internal_state(True)
|
||||
|
||||
# render
|
||||
if self.is_rendered:
|
||||
image = self.get_rendered_image()
|
||||
scale = 1
|
||||
if self.human_control:
|
||||
scale = 2
|
||||
if not self.native_rendering:
|
||||
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
|
||||
|
||||
# measurements
|
||||
if self.env.spec is not None:
|
||||
self.timestep_limit = self.env.spec.timestep_limit
|
||||
else:
|
||||
self.timestep_limit = None
|
||||
|
||||
# the info is only updated after the first step
|
||||
self.state = self.step(self.action_space.default_action).next_state
|
||||
self.state_space['measurements'] = VectorObservationSpace(shape=len(self.info.keys()))
|
||||
|
||||
if self.env.spec and custom_reward_threshold is None:
|
||||
self.reward_success_threshold = self.env.spec.reward_threshold
|
||||
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold)
|
||||
|
||||
def _wrap_state(self, state):
|
||||
if not isinstance(self.env.observation_space, gym.spaces.Dict):
|
||||
return {'observation': state}
|
||||
return state
|
||||
|
||||
def _update_state(self):
|
||||
if self.is_atari_env and hasattr(self, 'current_ale_lives') \
|
||||
and self.current_ale_lives != self.env.unwrapped.ale.lives():
|
||||
if self.phase == RunPhase.TRAIN or self.phase == RunPhase.HEATUP:
|
||||
# signal termination for life loss
|
||||
self.done = True
|
||||
elif self.phase == RunPhase.TEST and not self.done:
|
||||
# the episode is not terminated in evaluation, but we need to press fire again
|
||||
self._press_fire()
|
||||
self._update_ale_lives()
|
||||
# TODO: update the measurements
|
||||
if self.state and "desired_goal" in self.state.keys():
|
||||
self.goal = self.state['desired_goal']
|
||||
|
||||
def _take_action(self, action):
|
||||
if type(self.action_space) == BoxActionSpace:
|
||||
action = self.action_space.clip_action_to_space(action)
|
||||
|
||||
self.state, self.reward, self.done, self.info = self.env.step(action)
|
||||
self.state = self._wrap_state(self.state)
|
||||
|
||||
def _random_noop(self):
|
||||
# simulate a random initial environment state by stepping for a random number of steps
# between 0 and random_initialization_steps
|
||||
step_count = 0
|
||||
random_initialization_steps = random.randint(0, self.random_initialization_steps)
|
||||
while self.action_space is not None and (self.state is None or step_count < random_initialization_steps):
|
||||
step_count += 1
|
||||
self.step(self.action_space.default_action)
|
||||
|
||||
def _press_fire(self):
|
||||
fire_action = 1
|
||||
if self.is_atari_env and self.env.unwrapped.get_action_meanings()[fire_action] == 'FIRE':
|
||||
self.current_ale_lives = self.env.unwrapped.ale.lives()
|
||||
self.step(fire_action)
|
||||
if self.done:
|
||||
self.reset_internal_state()
|
||||
|
||||
def _update_ale_lives(self):
|
||||
if self.is_atari_env:
|
||||
self.current_ale_lives = self.env.unwrapped.ale.lives()
|
||||
|
||||
def _restart_environment_episode(self, force_environment_reset=False):
|
||||
# prevent reset of environment if there are ale lives left
|
||||
if (self.is_atari_env and self.env.unwrapped.ale.lives() > 0) \
|
||||
and not force_environment_reset and not self.timelimit_env_wrapper._past_limit():
|
||||
self.step(self.action_space.default_action)
|
||||
else:
|
||||
self.state = self.env.reset()
|
||||
self.state = self._wrap_state(self.state)
|
||||
self._update_ale_lives()
|
||||
|
||||
if self.is_atari_env:
|
||||
self._random_noop()
|
||||
self._press_fire()
|
||||
|
||||
# initialize the number of lives
|
||||
self._update_ale_lives()
|
||||
|
||||
def _set_mujoco_camera(self, camera_idx: int):
|
||||
"""
|
||||
This function can be used to set the camera for rendering the mujoco simulator
|
||||
:param camera_idx: The index of the camera to use. Should be defined in the model
|
||||
:return: None
|
||||
"""
|
||||
if self.env.unwrapped.viewer.cam.fixedcamid != camera_idx and self.env.unwrapped.viewer._ncam > camera_idx:
|
||||
from mujoco_py.generated import const
|
||||
self.env.unwrapped.viewer.cam.type = const.CAMERA_FIXED
|
||||
self.env.unwrapped.viewer.cam.fixedcamid = camera_idx
|
||||
|
||||
def _get_robotics_image(self):
|
||||
self.env.render()
|
||||
image = self.env.unwrapped._get_viewer().read_pixels(1600, 900, depth=False)[::-1, :, :]
|
||||
image = scipy.misc.imresize(image, (270, 480, 3))
|
||||
return image
|
||||
|
||||
def _render(self):
|
||||
self.env.render(mode='human')
|
||||
# required for setting up a fixed camera for mujoco
|
||||
if self.is_mujoco_env:
|
||||
self._set_mujoco_camera(0)
|
||||
|
||||
def get_rendered_image(self):
|
||||
if self.is_robotics_env:
|
||||
# necessary for fetch since the rendered image is cropped to an irrelevant part of the simulator
|
||||
image = self._get_robotics_image()
|
||||
else:
|
||||
image = self.env.render(mode='rgb_array')
|
||||
# required for setting up a fixed camera for mujoco
|
||||
if self.is_mujoco_env:
|
||||
self._set_mujoco_camera(0)
|
||||
return image
|
||||
0
rl_coach/environments/mujoco/__init__.py
Normal file
38
rl_coach/environments/mujoco/common/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# Copyright 2017 The dm_control Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Functions to manage the common assets for domains."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
from dm_control.utils import resources
|
||||
|
||||
_SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
|
||||
_FILENAMES = [
|
||||
"common/materials.xml",
|
||||
"common/skybox.xml",
|
||||
"common/visual.xml",
|
||||
]
|
||||
|
||||
ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
|
||||
for filename in _FILENAMES}
|
||||
|
||||
|
||||
def read_model(model_filename):
|
||||
"""Reads a model XML file and returns its contents as a string."""
|
||||
return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
|
||||
22
rl_coach/environments/mujoco/common/materials.xml
Normal file
@@ -0,0 +1,22 @@
|
||||
<!--
|
||||
Common textures, colors and materials to be used throughout this suite. Some
|
||||
materials such as xxx_highlight are activated on occurrence of certain events,
|
||||
for example receiving a positive reward.
|
||||
-->
|
||||
<mujoco>
|
||||
<asset>
|
||||
<texture name="grid" type="2d" builtin="checker" rgb1=".1 .2 .3" rgb2=".2 .3 .4" width="300" height="300" mark="edge" markrgb=".2 .3 .4"/>
|
||||
<material name="grid" texture="grid" texrepeat="1 1" texuniform="true" reflectance=".2"/>
|
||||
<material name="self" rgba=".7 .5 .3 1"/>
|
||||
<material name="self_default" rgba=".7 .5 .3 1"/>
|
||||
<material name="self_highlight" rgba="0 .5 .3 1"/>
|
||||
<material name="effector" rgba=".7 .4 .2 1"/>
|
||||
<material name="effector_default" rgba=".7 .4 .2 1"/>
|
||||
<material name="effector_highlight" rgba="0 .5 .3 1"/>
|
||||
<material name="decoration" rgba=".3 .5 .7 1"/>
|
||||
<material name="eye" rgba="0 .2 1 1"/>
|
||||
<material name="target" rgba=".6 .3 .3 1"/>
|
||||
<material name="target_default" rgba=".6 .3 .3 1"/>
|
||||
<material name="target_highlight" rgba=".6 .3 .3 .4"/>
|
||||
</asset>
|
||||
</mujoco>
|
||||
6
rl_coach/environments/mujoco/common/skybox.xml
Normal file
@@ -0,0 +1,6 @@
|
||||
<mujoco>
|
||||
<asset>
|
||||
<texture name="skybox" type="skybox" builtin="gradient" rgb1=".4 .6 .8" rgb2="0 0 0"
|
||||
width="800" height="800" mark="random" markrgb="1 1 1"/>
|
||||
</asset>
|
||||
</mujoco>
|
||||
7
rl_coach/environments/mujoco/common/visual.xml
Normal file
@@ -0,0 +1,7 @@
|
||||
<mujoco>
|
||||
<visual>
|
||||
<headlight ambient=".4 .4 .4" diffuse=".8 .8 .8" specular="0.1 0.1 0.1"/>
|
||||
<map znear=".01"/>
|
||||
<quality shadowsize="2048"/>
|
||||
</visual>
|
||||
</mujoco>
|
||||
185
rl_coach/environments/mujoco/pendulum_with_goals.py
Normal file
@@ -0,0 +1,185 @@
import numpy as np
import gym
import os
from gym import spaces
from gym.envs.registration import EnvSpec

from mujoco_py import load_model_from_path, MjSim, MjViewer, MjRenderContextOffscreen


class PendulumWithGoals(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
    }

    def __init__(self, goal_reaching_thresholds=np.array([0.075, 0.075, 0.75]),
                 goal_not_reached_penalty=-1, goal_reached_reward=0, terminate_on_goal_reaching=True,
                 time_limit=1000, frameskip=1, random_goals_instead_of_standing_goal=False,
                 polar_coordinates: bool=False):
        super().__init__()
        dir = os.path.dirname(__file__)
        model = load_model_from_path(dir + "/pendulum_with_goals.xml")

        self.sim = MjSim(model)
        self.viewer = None
        self.rgb_viewer = None

        self.frameskip = frameskip
        self.goal = None
        self.goal_reaching_thresholds = goal_reaching_thresholds
        self.goal_not_reached_penalty = goal_not_reached_penalty
        self.goal_reached_reward = goal_reached_reward
        self.terminate_on_goal_reaching = terminate_on_goal_reaching
        self.time_limit = time_limit
        self.current_episode_steps_counter = 0
        self.random_goals_instead_of_standing_goal = random_goals_instead_of_standing_goal
        self.polar_coordinates = polar_coordinates

        # spaces definition
        self.action_space = spaces.Box(low=-self.sim.model.actuator_ctrlrange[:, 1],
                                       high=self.sim.model.actuator_ctrlrange[:, 1],
                                       dtype=np.float32)
        if self.polar_coordinates:
            self.observation_space = spaces.Dict({
                "observation": spaces.Box(low=np.array([-np.pi, -15]),
                                          high=np.array([np.pi, 15]),
                                          dtype=np.float32),
                "desired_goal": spaces.Box(low=np.array([-np.pi, -15]),
                                           high=np.array([np.pi, 15]),
                                           dtype=np.float32),
                "achieved_goal": spaces.Box(low=np.array([-np.pi, -15]),
                                            high=np.array([np.pi, 15]),
                                            dtype=np.float32)
            })
        else:
            self.observation_space = spaces.Dict({
                "observation": spaces.Box(low=np.array([-1, -1, -15]),
                                          high=np.array([1, 1, 15]),
                                          dtype=np.float32),
                "desired_goal": spaces.Box(low=np.array([-1, -1, -15]),
                                           high=np.array([1, 1, 15]),
                                           dtype=np.float32),
                "achieved_goal": spaces.Box(low=np.array([-1, -1, -15]),
                                            high=np.array([1, 1, 15]),
                                            dtype=np.float32)
            })

        self.spec = EnvSpec('PendulumWithGoals-v0')
        self.spec.reward_threshold = self.goal_not_reached_penalty * self.time_limit

        self.reset()

    def _goal_reached(self):
        observation = self._get_obs()
        if np.any(np.abs(observation['achieved_goal'] - observation['desired_goal']) > self.goal_reaching_thresholds):
            return False
        else:
            return True

    def _terminate(self):
        if (self._goal_reached() and self.terminate_on_goal_reaching) or \
                self.current_episode_steps_counter >= self.time_limit:
            return True
        else:
            return False

    def _reward(self):
        if self._goal_reached():
            return self.goal_reached_reward
        else:
            return self.goal_not_reached_penalty

    def step(self, action):
        self.sim.data.ctrl[:] = action
        for _ in range(self.frameskip):
            self.sim.step()

        self.current_episode_steps_counter += 1

        state = self._get_obs()

        # visualize the angular velocities
        state_velocity = np.copy(state['observation'][-1] / 20)
        goal_velocity = self.goal[-1] / 20
        self.sim.model.site_size[2] = np.array([0.01, 0.01, state_velocity])
        self.sim.data.mocap_pos[2] = np.array([0.85, 0, 0.75 + state_velocity])
        self.sim.model.site_size[3] = np.array([0.01, 0.01, goal_velocity])
        self.sim.data.mocap_pos[3] = np.array([1.15, 0, 0.75 + goal_velocity])

        return state, self._reward(), self._terminate(), {}

    def _get_obs(self):
        """
             y
             ^
             |____
             |   /
             |  /
             |~/
             |/
              --------> x

        """

        # observation
        angle = self.sim.data.qpos
        angular_velocity = self.sim.data.qvel
        if self.polar_coordinates:
            observation = np.concatenate([angle - np.pi, angular_velocity])
        else:
            x = np.sin(angle)
            y = np.cos(angle)  # qpos is the angle relative to a standing pole
            observation = np.concatenate([x, y, angular_velocity])

        return {
            "observation": observation,
            "desired_goal": self.goal,
            "achieved_goal": observation
        }

    def reset(self):
        self.current_episode_steps_counter = 0

        # set initial state
        angle = np.random.uniform(np.pi / 4, 7 * np.pi / 4)
        angular_velocity = np.random.uniform(-0.05, 0.05)
        self.sim.data.qpos[0] = angle
        self.sim.data.qvel[0] = angular_velocity
        self.sim.step()

        # goal
        if self.random_goals_instead_of_standing_goal:
            angle_target = np.random.uniform(-np.pi / 8, np.pi / 8)
            angular_velocity_target = np.random.uniform(-0.2, 0.2)
        else:
            angle_target = 0
            angular_velocity_target = 0

        # convert target values to goal
        x_target = np.sin(angle_target)
        y_target = np.cos(angle_target)
        if self.polar_coordinates:
            self.goal = np.array([angle_target - np.pi, angular_velocity_target])
        else:
            self.goal = np.array([x_target, y_target, angular_velocity_target])

        # visualize the goal
        self.sim.data.mocap_pos[0] = [x_target, 0, y_target]

        return self._get_obs()

    def render(self, mode='human', close=False):
        if mode == 'human':
            if self.viewer is None:
                self.viewer = MjViewer(self.sim)
            self.viewer.render()
        elif mode == 'rgb_array':
            if self.rgb_viewer is None:
                self.rgb_viewer = MjRenderContextOffscreen(self.sim, 0)
            self.rgb_viewer.render(500, 500)
            # window size used for old mujoco-py:
            data = self.rgb_viewer.read_pixels(500, 500, depth=False)
            # original image is upside-down, so flip it
            return data[::-1, :, :]
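A minimal rollout sketch for the environment above (assuming mujoco_py is installed and the pendulum_with_goals.xml asset sits next to the module; the random-action loop is purely illustrative):

# Hedged sketch: rolling out PendulumWithGoals with random torques.
# The import path is an assumption based on the file location in this diff.
from rl_coach.environments.mujoco.pendulum_with_goals import PendulumWithGoals

env = PendulumWithGoals(time_limit=200, random_goals_instead_of_standing_goal=False)
obs = env.reset()
done = False
episode_return = 0
while not done:
    action = env.action_space.sample()        # random torque within the actuator ctrlrange
    obs, reward, done, _ = env.step(action)   # obs holds observation / desired_goal / achieved_goal
    episode_return += reward
print(episode_return)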
42
rl_coach/environments/mujoco/pendulum_with_goals.xml
Normal file
42
rl_coach/environments/mujoco/pendulum_with_goals.xml
Normal file
@@ -0,0 +1,42 @@
<mujoco model="pendulum_with_goals">
    <include file="./common/visual.xml"/>
    <include file="./common/skybox.xml"/>
    <include file="./common/materials.xml"/>

    <option timestep="0.002">
        <flag contact="disable" energy="enable"/>
    </option>

    <worldbody>
        <light name="light" pos="0 0 2"/>
        <geom name="floor" size="2 2 .2" type="plane" material="grid"/>
        <camera name="fixed" pos="0 -1.5 2" xyaxes='1 0 0 0 1 1'/>
        <camera name="lookat" mode="targetbodycom" target="pole" pos="0 -2 1"/>
        <body name="pole" pos="0 0 .6">
            <joint name="hinge" type="hinge" axis="0 1 0" damping="0.1"/>
            <geom name="base" material="decoration" type="cylinder" fromto="0 -.03 0 0 .03 0" size="0.021" mass="0"/>
            <geom name="pole" material="self" type="capsule" fromto="0 0 0 0 0 0.5" size="0.02" mass="0"/>
            <geom name="mass" material="effector" type="sphere" pos="0 0 0.5" size="0.05" mass="1"/>
        </body>

        <body name="end_goal" pos="0 0 0" mocap="true">
            <site type="sphere" size="0.05" rgba="1 1 0 1" />
        </body>
        <!--<body name="sub_goal" pos="0 0 0" mocap="true">-->
            <!--<site type="sphere" size="0.05" rgba="1 0 1 1" />-->
        <!--</body>-->
        <body name="current_velo" pos="0.0 0 0.0" mocap="true">
            <site type="box" size="0.01 0.01 0.1" rgba="1 1 1 1" />
        </body>
        <body name="subgoal_velo" pos="0.0 0 0.0" mocap="true">
            <site type="box" size="0.01 0.01 0.1" rgba="1 0 1 1" />
        </body>
        <body name="zero_velo" pos="1.0 0 0.75" mocap="true">
            <site type="box" size="0.3 0.01 0.01" rgba="1 0 0 1" />
        </body>
    </worldbody>

    <actuator>
        <motor name="torque" joint="hinge" gear="1" ctrlrange="-2 2" ctrllimited="true"/>
    </actuator>
</mujoco>
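The single motor's ctrlrange above is exactly what PendulumWithGoals reads to build its Box action space, and the mocap bodies are the markers the environment repositions for visualization. A small inspection sketch using the same mujoco_py calls as the environment (the local file path is an assumption; adjust it to where the XML is installed):

# Hedged sketch: inspect the MJCF model that backs PendulumWithGoals.
from mujoco_py import load_model_from_path, MjSim

model = load_model_from_path("pendulum_with_goals.xml")  # assumed local path
print(model.actuator_ctrlrange)   # [[-2. 2.]] -> torque bounds that become the action space

sim = MjSim(model)
print(sim.data.mocap_pos)         # one row per mocap body (goal and velocity markers)

sim.data.ctrl[:] = 0.5            # apply a constant torque, as env.step() does
sim.step()
print(sim.data.qpos, sim.data.qvel)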
245
rl_coach/environments/starcraft2_environment.py
Normal file
245
rl_coach/environments/starcraft2_environment.py
Normal file
@@ -0,0 +1,245 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


from enum import Enum
from typing import Union, List

import numpy as np
from rl_coach.filters.observation.observation_move_axis_filter import ObservationMoveAxisFilter

try:
    from pysc2 import maps
    from pysc2.env import sc2_env
    from pysc2.env import available_actions_printer
    from pysc2.lib import actions
    from pysc2.lib import features
    from pysc2.env import environment
    from absl import app
    from absl import flags
except ImportError:
    from rl_coach.logger import failed_imports
    failed_imports.append("PySc2")

from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.spaces import BoxActionSpace, VectorObservationSpace, PlanarMapsObservationSpace, StateSpace, CompoundActionSpace, \
    DiscreteActionSpace
from rl_coach.filters.filter import InputFilter, OutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.action.linear_box_to_box_map import LinearBoxToBoxMap
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter

FLAGS = flags.FLAGS
FLAGS(['coach.py'])

SCREEN_SIZE = 84  # will also impact the action space size

# Starcraft Constants
_NOOP = actions.FUNCTIONS.no_op.id
_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_NOT_QUEUED = [0]
_SELECT_ALL = [0]


class StarcraftObservationType(Enum):
    Features = 0
    RGB = 1


StarcraftInputFilter = InputFilter(is_a_reference_filter=True)
StarcraftInputFilter.add_observation_filter('screen', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('screen', 'rescaling',
                                            ObservationRescaleToSizeFilter(
                                                PlanarMapsObservationSpace(np.array([84, 84, 1]),
                                                                           low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('screen', 'to_uint8', ObservationToUInt8Filter(0, 255))

StarcraftInputFilter.add_observation_filter('minimap', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('minimap', 'rescaling',
                                            ObservationRescaleToSizeFilter(
                                                PlanarMapsObservationSpace(np.array([64, 64, 1]),
                                                                           low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('minimap', 'to_uint8', ObservationToUInt8Filter(0, 255))


StarcraftNormalizingOutputFilter = OutputFilter(is_a_reference_filter=True)
StarcraftNormalizingOutputFilter.add_action_filter(
    'normalization', LinearBoxToBoxMap(input_space_low=-SCREEN_SIZE / 2, input_space_high=SCREEN_SIZE / 2 - 1))


class StarCraft2EnvironmentParameters(EnvironmentParameters):
    def __init__(self):
        super().__init__()
        self.screen_size = 84
        self.minimap_size = 64
        self.feature_minimap_maps_to_use = range(7)
        self.feature_screen_maps_to_use = range(17)
        self.observation_type = StarcraftObservationType.Features
        self.disable_fog = False
        self.auto_select_all_army = True
        self.default_input_filter = StarcraftInputFilter
        self.default_output_filter = StarcraftNormalizingOutputFilter
        self.use_full_action_space = False

    @property
    def path(self):
        return 'rl_coach.environments.starcraft2_environment:StarCraft2Environment'


# Environment
class StarCraft2Environment(Environment):
    def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
                 seed: Union[None, int]=None, human_control: bool=False,
                 custom_reward_threshold: Union[int, float]=None,
                 screen_size: int=84, minimap_size: int=64,
                 feature_minimap_maps_to_use: List=range(7), feature_screen_maps_to_use: List=range(17),
                 observation_type: StarcraftObservationType=StarcraftObservationType.Features,
                 disable_fog: bool=False, auto_select_all_army: bool=True,
                 use_full_action_space: bool=False, **kwargs):
        super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)

        self.screen_size = screen_size
        self.minimap_size = minimap_size
        self.feature_minimap_maps_to_use = feature_minimap_maps_to_use
        self.feature_screen_maps_to_use = feature_screen_maps_to_use
        self.observation_type = observation_type
        self.features_screen_size = None
        self.feature_minimap_size = None
        self.rgb_screen_size = None
        self.rgb_minimap_size = None
        if self.observation_type == StarcraftObservationType.Features:
            self.features_screen_size = screen_size
            self.feature_minimap_size = minimap_size
        elif self.observation_type == StarcraftObservationType.RGB:
            self.rgb_screen_size = screen_size
            self.rgb_minimap_size = minimap_size
        self.disable_fog = disable_fog
        self.auto_select_all_army = auto_select_all_army
        self.use_full_action_space = use_full_action_space

        # step_mul is the equivalent of frame skipping. It is unclear whether actions are repeated in between steps.
        self.env = sc2_env.SC2Env(map_name=self.env_id, step_mul=frame_skip,
                                  visualize=self.is_rendered,
                                  agent_interface_format=sc2_env.AgentInterfaceFormat(
                                      feature_dimensions=sc2_env.Dimensions(
                                          screen=self.features_screen_size,
                                          minimap=self.feature_minimap_size
                                      )
                                      # rgb_dimensions=sc2_env.Dimensions(
                                      #     screen=self.rgb_screen_size,
                                      #     minimap=self.rgb_screen_size
                                      # )
                                  ),
                                  # feature_screen_size=self.features_screen_size,
                                  # feature_minimap_size=self.feature_minimap_size,
                                  # rgb_screen_size=self.rgb_screen_size,
                                  # rgb_minimap_size=self.rgb_screen_size,
                                  disable_fog=disable_fog,
                                  random_seed=self.seed
                                  )

        # print all the available actions
        # self.env = available_actions_printer.AvailableActionsPrinter(self.env)

        self.reset_internal_state(True)

        """
        feature_screen: [height_map, visibility_map, creep, power, player_id, player_relative, unit_type, selected,
                         unit_hit_points, unit_hit_points_ratio, unit_energy, unit_energy_ratio, unit_shields,
                         unit_shields_ratio, unit_density, unit_density_aa, effects]
        feature_minimap: [height_map, visibility_map, creep, camera, player_id, player_relative, selected]
        player: [player_id, minerals, vespene, food_cap, food_army, food_workers, idle_worker_count,
                 army_count, warp_gate_count, larva_count]
        """
        self.screen_shape = np.array(self.env.observation_spec()[0]['feature_screen'])
        self.screen_shape[0] = len(self.feature_screen_maps_to_use)
        self.minimap_shape = np.array(self.env.observation_spec()[0]['feature_minimap'])
        self.minimap_shape[0] = len(self.feature_minimap_maps_to_use)
        self.state_space = StateSpace({
            "screen": PlanarMapsObservationSpace(shape=self.screen_shape, low=0, high=255, channels_axis=0),
            "minimap": PlanarMapsObservationSpace(shape=self.minimap_shape, low=0, high=255, channels_axis=0),
            "measurements": VectorObservationSpace(self.env.observation_spec()[0]["player"][0])
        })
        if self.use_full_action_space:
            action_identifiers = list(self.env.action_spec()[0].functions)
            num_action_identifiers = len(action_identifiers)
            action_arguments = [(arg.name, arg.sizes) for arg in self.env.action_spec()[0].types]
            sub_action_spaces = [DiscreteActionSpace(num_action_identifiers)]
            for argument in action_arguments:
                for dimension in argument[1]:
                    sub_action_spaces.append(DiscreteActionSpace(dimension))
            self.action_space = CompoundActionSpace(sub_action_spaces)
        else:
            self.action_space = BoxActionSpace(2, 0, self.screen_size - 1, ["X-Axis, Y-Axis"],
                                               default_action=np.array([self.screen_size/2, self.screen_size/2]))

    def _update_state(self):
        timestep = 0
        self.screen = self.last_result[timestep].observation.feature_screen
        # extract only the requested segmentation maps from the observation
        self.screen = np.take(self.screen, self.feature_screen_maps_to_use, axis=0)
        self.minimap = self.last_result[timestep].observation.feature_minimap
        self.measurements = self.last_result[timestep].observation.player
        self.reward = self.last_result[timestep].reward
        self.done = self.last_result[timestep].step_type == environment.StepType.LAST
        self.state = {
            'screen': self.screen,
            'minimap': self.minimap,
            'measurements': self.measurements
        }

    def _take_action(self, action):
        if self.use_full_action_space:
            action_identifier = action[0]
            action_arguments = action[1:]
            action = actions.FunctionCall(action_identifier, action_arguments)
        else:
            coord = np.array(action[0:2])
            noop = False
            coord = coord.round()
            coord = np.clip(coord, 0, SCREEN_SIZE - 1)
            self.last_action_idx = coord

            if noop:
                action = actions.FunctionCall(_NOOP, [])
            else:
                action = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])

        self.last_result = self.env.step(actions=[action])

    def _restart_environment_episode(self, force_environment_reset=False):
        # reset the environment
        self.last_result = self.env.reset()

        # select all the units on the screen
        if self.auto_select_all_army:
            self.env.step(actions=[actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])

    def get_rendered_image(self):
        screen = np.squeeze(np.tile(np.expand_dims(self.screen, -1), (1, 1, 3)))
        screen = screen / np.max(screen) * 255
        return screen.astype('uint8')

    def dump_video_of_last_episode(self):
        from rl_coach.logger import experiment_path
        self.env._run_config.replay_dir = experiment_path
        self.env.save_replay('replays')
        super().dump_video_of_last_episode()
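A configuration sketch for the parameters class above, in the style of a Coach preset. The 'CollectMineralShards' map name and the level attribute inherited from EnvironmentParameters are assumptions, not part of this diff:

# Hedged sketch: configuring the StarCraft II environment for a preset.
from rl_coach.environments.starcraft2_environment import StarCraft2EnvironmentParameters, \
    StarcraftObservationType

env_params = StarCraft2EnvironmentParameters()
env_params.level = 'CollectMineralShards'   # pysc2 mini-game map (assumed attribute and name)
env_params.screen_size = 84                 # matches the 84x84 rescaling filter defined above
env_params.minimap_size = 64
env_params.observation_type = StarcraftObservationType.Features
env_params.use_full_action_space = False    # keep the simple "move to screen coordinate" action space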
0
rl_coach/environments/toy_problems/__init__.py
Normal file
0
rl_coach/environments/toy_problems/__init__.py
Normal file
82
rl_coach/environments/toy_problems/bit_flip.py
Normal file
82
rl_coach/environments/toy_problems/bit_flip.py
Normal file
@@ -0,0 +1,82 @@
import numpy as np
import gym
from gym import spaces
import random


class BitFlip(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
    }

    def __init__(self, bit_length=16, max_steps=None, mean_zero=False):
        super(BitFlip, self).__init__()
        if bit_length < 1:
            raise ValueError('bit_length must be >= 1, found {}'.format(bit_length))
        self.bit_length = bit_length
        self.mean_zero = mean_zero

        if max_steps is None:
            # default to bit_length
            self.max_steps = bit_length
        elif max_steps == 0:
            self.max_steps = None
        else:
            self.max_steps = max_steps

        # spaces documentation: https://gym.openai.com/docs/
        self.action_space = spaces.Discrete(bit_length)
        self.observation_space = spaces.Dict({
            'state': spaces.Box(low=0, high=1, shape=(bit_length, )),
            'desired_goal': spaces.Box(low=0, high=1, shape=(bit_length, )),
            'achieved_goal': spaces.Box(low=0, high=1, shape=(bit_length, ))
        })

        self.reset()

    def _terminate(self):
        # max_steps is None when the episode length is unlimited (max_steps=0 was passed)
        return (self.state == self.goal).all() or \
               (self.max_steps is not None and self.steps >= self.max_steps)

    def _reward(self):
        return -1 if (self.state != self.goal).any() else 0

    def step(self, action):
        # action is an int in the range [0, self.bit_length)
        self.state[action] = int(not self.state[action])
        self.steps += 1

        return (self._get_obs(), self._reward(), self._terminate(), {})

    def reset(self):
        self.steps = 0

        self.state = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])

        # make sure goal is not the initial state
        self.goal = self.state
        while (self.goal == self.state).all():
            self.goal = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])

        return self._get_obs()

    def _mean_zero(self, x):
        if self.mean_zero:
            return (x - 0.5) / 0.5
        else:
            return x

    def _get_obs(self):
        return {
            'state': self._mean_zero(self.state),
            'desired_goal': self._mean_zero(self.goal),
            'achieved_goal': self._mean_zero(self.state)
        }

    def render(self, mode='human', close=False):
        observation = np.zeros((20, 20 * self.bit_length, 3))
        for bit_idx, (state_bit, goal_bit) in enumerate(zip(self.state, self.goal)):
            # green if the bit matches
            observation[:, bit_idx * 20:(bit_idx + 1) * 20, 1] = (state_bit == goal_bit) * 255
            # red if the bit doesn't match
            observation[:, bit_idx * 20:(bit_idx + 1) * 20, 0] = (state_bit != goal_bit) * 255
        return observation
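BitFlip is the goal-conditioned toy task from the Hindsight Experience Replay paper: the agent must reproduce a random goal bit string by flipping one bit per step, under a sparse -1/0 reward. A minimal random-agent sketch (the import path is assumed from the file location in this diff):

# Hedged sketch: a random agent on the BitFlip toy problem.
from rl_coach.environments.toy_problems.bit_flip import BitFlip

env = BitFlip(bit_length=8)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()        # pick one of the 8 bits to flip
    obs, reward, done, info = env.step(action)
    # HER-style relabelling compares obs['achieved_goal'] to obs['desired_goal']
print('final reward:', reward)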