
pre-release 0.10.0

Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions

rl_coach/__init__.py Normal file

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

rl_coach/agents/actor_critic_agent.py Normal file

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import scipy.signal
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import QActionStateValue
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import last_sample
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class ActorCriticAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.A_VALUE
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 5000 # this is called t_max in all the papers
self.gae_lambda = 0.96
self.estimate_state_value_using_gae = False
class ActorCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters(), PolicyHeadParameters()]
self.loss_weights = [0.5, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.optimizer_type = 'Adam'
self.clip_gradients = 40.0
self.async_training = True
class ActorCriticAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ActorCriticAlgorithmParameters(),
exploration=None, #TODO this should be different for continuous (ContinuousEntropyExploration)
# and discrete (CategoricalExploration) action spaces.
memory=SingleEpisodeBufferParameters(),
networks={"main": ActorCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.actor_critic_agent:ActorCriticAgent'
# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.action_advantages = self.register_signal('Advantages')
self.state_values = self.register_signal('Values')
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
# Discounting function used to calculate discounted returns.
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def get_general_advantage_estimation_values(self, rewards, values):
        # values contains n+1 elements (V_t ... V_{t+n}); rewards contains n elements (r_t ... r_{t+n-1})
bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])
        # Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
        # although in practice it works well even with much smaller Tmax values, e.g. 20)
deltas = rewards + self.ap.algorithm.discount * values[1:] - values[:-1]
gae = self.discount(deltas, self.ap.algorithm.discount * self.ap.algorithm.gae_lambda)
if self.ap.algorithm.estimate_state_value_using_gae:
discounted_returns = np.expand_dims(gae + values[:-1], -1)
else:
discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
self.ap.algorithm.discount)), 1)[:-1]
return gae, discounted_returns
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
result = self.networks['main'].online_network.predict(batch.states(network_keys))
current_state_values = result[0]
self.state_values.add_sample(current_state_values)
# the targets for the state value estimator
num_transitions = batch.size
state_value_head_targets = np.zeros((num_transitions, 1))
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if batch.game_overs()[-1]:
R = 0
else:
R = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
for i in reversed(range(num_transitions)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.networks['main'].online_network.predict(last_sample(batch.next_states(network_keys)))[0]
values = np.append(current_state_values, bootstrapped_value)
if batch.game_overs()[-1]:
values[-1] = 0
# get general discounted returns table
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(batch.rewards(), values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
action_advantages = action_advantages.squeeze(axis=-1)
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
# train
result = self.networks['main'].online_network.accumulate_gradients({**batch.states(network_keys),
'output_1_0': actions},
[state_value_head_targets, action_advantages])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.action_advantages.add_sample(action_advantages)
self.unclipped_grads.add_sample(unclipped_grads)
self.value_loss.add_sample(losses[0])
self.policy_loss.add_sample(losses[1])
return total_loss, losses, unclipped_grads
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)[1:] # index 0 is the state value
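
# --- Editor's illustration (not part of the original commit) ---
# The discount()/GAE code above uses scipy.signal.lfilter to evaluate the
# backward recursion y[t] = x[t] + gamma * y[t+1] in a vectorized way. The
# reference helper below spells out the same Generalized Advantage Estimation
# recursion with an explicit loop; it is only a sketch for checking the
# filter-based version and is not used anywhere in Coach.
def _reference_gae(rewards, values, discount, gae_lambda):
    """Explicit-loop GAE, equivalent to discount(deltas, discount * gae_lambda) above.

    :param rewards: array of n rewards
    :param values: array of n + 1 state values (the last entry is the bootstrap value)
    """
    deltas = rewards + discount * values[1:] - values[:-1]
    gae = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + discount * gae_lambda * running
        gae[t] = running
    return gae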

rl_coach/agents/agent.py Normal file

@@ -0,0 +1,791 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import random
from collections import OrderedDict
from typing import Dict, List, Union, Tuple
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.core_types import RunPhase, PredictionType, EnvironmentEpisodes, ActionType, Batch, Episode, StateType
from rl_coach.core_types import Transition, ActionInfo, TrainingSteps, EnvironmentSteps, EnvResponse
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay
from pandas import read_pickle
from six.moves import range
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace, GoalsSpace, AttentionActionSpace
from rl_coach.utils import Signal, force_list, set_cpu
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.network_wrapper import NetworkWrapper
from rl_coach.logger import screen, Logger, EpisodeLogger
class Agent(AgentInterface):
def __init__(self, agent_parameters: AgentParameters, parent: Union['LevelManager', 'CompositeAgent']=None):
"""
        :param agent_parameters: An AgentParameters instance with all of the agent's run parameters
"""
super().__init__()
self.ap = agent_parameters
self.task_id = self.ap.task_parameters.task_index
self.is_chief = self.task_id == 0
self.shared_memory = type(agent_parameters.task_parameters) == DistributedTaskParameters \
and self.ap.memory.shared_memory
if self.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
self.name = agent_parameters.name
self.parent = parent
self.parent_level_manager = None
self.full_name_id = agent_parameters.full_name_id = self.name
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
screen.log_title("Creating agent - name: {} task id: {} (may take up to 30 seconds due to "
"tensorflow wake up time)".format(self.full_name_id, self.task_id))
else:
screen.log_title("Creating agent - name: {}".format(self.full_name_id))
self.imitation = False
self.agent_logger = Logger()
self.agent_episode_logger = EpisodeLogger()
# get the memory
# - distributed training + shared memory:
# * is chief? -> create the memory and add it to the scratchpad
# * not chief? -> wait for the chief to create the memory and then fetch it
# - non distributed training / not shared memory:
# * create memory
memory_name = self.ap.memory.path.split(':')[1]
self.memory_lookup_name = self.full_name_id + '.' + memory_name
if self.shared_memory and not self.is_chief:
self.memory = self.shared_memory_scratchpad.get(self.memory_lookup_name)
else:
# modules
if agent_parameters.memory.load_memory_from_file_path:
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
.format(agent_parameters.memory.load_memory_from_file_path))
self.memory = read_pickle(agent_parameters.memory.load_memory_from_file_path)
else:
self.memory = dynamic_import_and_instantiate_module_from_params(self.ap.memory)
if self.shared_memory and self.is_chief:
self.shared_memory_scratchpad.add(self.memory_lookup_name, self.memory)
# set devices
if type(agent_parameters.task_parameters) == DistributedTaskParameters:
self.has_global = True
self.replicated_device = agent_parameters.task_parameters.device
self.worker_device = "/job:worker/task:{}".format(self.task_id)
else:
self.has_global = False
self.replicated_device = None
self.worker_device = ""
if agent_parameters.task_parameters.use_cpu:
self.worker_device += "/cpu:0"
else:
self.worker_device += "/device:GPU:0"
# filters
self.input_filter = self.ap.input_filter
self.output_filter = self.ap.output_filter
self.pre_network_filter = self.ap.pre_network_filter
device = self.replicated_device if self.replicated_device else self.worker_device
self.input_filter.set_device(device)
self.output_filter.set_device(device)
self.pre_network_filter.set_device(device)
# initialize all internal variables
self._phase = RunPhase.HEATUP
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.total_steps_counter = 0
self.running_reward = None
self.training_iteration = 0
self.last_target_network_update_step = 0
self.last_training_phase_step = 0
self.current_episode = self.ap.current_episode = 0
self.curr_state = {}
self.current_hrl_goal = None
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.last_episode_evaluation_ran = 0
self.running_observations = []
self.agent_logger.set_current_time(self.current_episode)
self.exploration_policy = None
self.networks = {}
self.last_action_info = None
self.running_observation_stats = None
self.running_reward_stats = None
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
        # TODO: add the agent's observation rendering for debugging purposes (not the same as the environment rendering)
# environment parameters
self.spaces = None
self.in_action_space = self.ap.algorithm.in_action_space
# signals
self.episode_signals = []
self.step_signals = []
self.loss = self.register_signal('Loss')
self.curr_learning_rate = self.register_signal('Learning Rate')
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.reward = self.register_signal('Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
self.shaped_reward = self.register_signal('Shaped Reward', dump_one_value_per_episode=False, dump_one_value_per_step=True)
if isinstance(self.in_action_space, GoalsSpace):
self.distance_from_goal = self.register_signal('Distance From Goal', dump_one_value_per_step=True)
# use seed
if self.ap.task_parameters.seed is not None:
random.seed(self.ap.task_parameters.seed)
np.random.seed(self.ap.task_parameters.seed)
@property
def parent(self):
"""
Get the parent class of the agent
        :return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if self._parent is not None:
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of an agent must have a name")
self.full_name_id = self.ap.full_name_id = "{}/{}".format(self._parent.name, self.name)
def setup_logger(self):
# dump documentation
logger_prefix = "{graph_name}.{level_name}.{agent_full_id}".\
format(graph_name=self.parent_level_manager.parent_graph_manager.name,
level_name=self.parent_level_manager.name,
agent_full_id='.'.join(self.full_name_id.split('/')))
self.agent_logger.set_logger_filenames(self.ap.task_parameters.experiment_path, logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
if self.ap.visualization.dump_in_episode_signals:
self.agent_episode_logger.set_logger_filenames(self.ap.task_parameters.experiment_path,
logger_prefix=logger_prefix,
add_timestamp=True, task_id=self.task_id)
def set_session(self, sess) -> None:
"""
        Set the deep learning framework session for all of the agent's networks and filters
:return: None
"""
self.input_filter.set_session(sess)
self.output_filter.set_session(sess)
self.pre_network_filter.set_session(sess)
[network.set_session(sess) for network in self.networks.values()]
def register_signal(self, signal_name: str, dump_one_value_per_episode: bool=True,
dump_one_value_per_step: bool=False) -> Signal:
"""
Register a signal such that its statistics will be dumped and be viewable through dashboard
:param signal_name: the name of the signal as it will appear in dashboard
:param dump_one_value_per_episode: should the signal value be written for each episode?
:param dump_one_value_per_step: should the signal value be written for each step?
:return: the created signal
"""
signal = Signal(signal_name)
if dump_one_value_per_episode:
self.episode_signals.append(signal)
if dump_one_value_per_step:
self.step_signals.append(signal)
return signal
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the environment spaces definition
:return: None
"""
self.spaces = copy.deepcopy(spaces)
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in self.spaces.state.sub_spaces:
self.spaces.state['measurements'].shape += 1
self.spaces.state['measurements'].measurements_names += ['accumulated_reward']
else:
self.spaces.state['measurements'] = VectorObservationSpace(1, measurements_names=['accumulated_reward'])
for observation_name in self.spaces.state.sub_spaces.keys():
self.spaces.state[observation_name] = \
self.pre_network_filter.get_filtered_observation_space(observation_name,
self.input_filter.get_filtered_observation_space(observation_name,
self.spaces.state[observation_name]))
self.spaces.reward = self.pre_network_filter.get_filtered_reward_space(
self.input_filter.get_filtered_reward_space(self.spaces.reward))
self.spaces.action = self.output_filter.get_unfiltered_action_space(self.spaces.action)
if isinstance(self.in_action_space, GoalsSpace):
# TODO: what if the goal type is an embedding / embedding change?
self.spaces.goal = self.in_action_space
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
self.init_environment_dependent_modules()
def create_networks(self) -> Dict[str, NetworkWrapper]:
"""
Create all the networks of the agent.
The network creation will be done after setting the environment parameters for the agent, since they are needed
for creating the network.
        :return: A dictionary mapping network names to the created network wrappers
"""
networks = {}
for network_name in sorted(self.ap.network_wrappers.keys()):
networks[network_name] = NetworkWrapper(name=network_name,
agent_parameters=self.ap,
has_target=self.ap.network_wrappers[network_name].create_target_network,
has_global=self.has_global,
spaces=self.spaces,
replicated_device=self.replicated_device,
worker_device=self.worker_device)
return networks
def init_environment_dependent_modules(self) -> None:
"""
Initialize any modules that depend on knowing information about the environment such as the action space or
the observation space
:return: None
"""
# initialize exploration policy
self.ap.exploration.action_space = self.spaces.action
self.exploration_policy = dynamic_import_and_instantiate_module_from_params(self.ap.exploration)
# create all the networks of the agent
self.networks = self.create_networks()
@property
def phase(self) -> RunPhase:
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the phase of the run for the agent and all the sub components
        :param val: the new run phase (TRAIN, TEST, etc.)
:return: None
"""
self.reset_evaluation_state(val)
self._phase = val
self.exploration_policy.change_phase(val)
def reset_evaluation_state(self, val: RunPhase) -> None:
starting_evaluation = (val == RunPhase.TEST)
ending_evaluation = (self.phase == RunPhase.TEST)
if starting_evaluation:
self.accumulated_rewards_across_evaluation_episodes = 0
self.accumulated_shaped_rewards_across_evaluation_episodes = 0
self.num_successes_across_evaluation_episodes = 0
self.num_evaluation_episodes_completed = 0
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Starting evaluation phase".format(self.name))
elif ending_evaluation:
            # we write to the next episode, because the current episode may already have been written to disk,
            # in which case it would not be written again
self.agent_logger.set_current_time(self.current_episode + 1)
self.agent_logger.create_signal_value(
'Evaluation Reward',
self.accumulated_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
self.agent_logger.create_signal_value(
'Shaped Evaluation Reward',
self.accumulated_shaped_rewards_across_evaluation_episodes / self.num_evaluation_episodes_completed)
success_rate = self.num_successes_across_evaluation_episodes / self.num_evaluation_episodes_completed
self.agent_logger.create_signal_value(
"Success Rate",
success_rate
)
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
screen.log_title("{}: Finished evaluation phase. Success rate = {}"
.format(self.name, np.round(success_rate, 2)))
def call_memory(self, func, args=()):
"""
This function is a wrapper to allow having the same calls for shared or unshared memories.
It should be used instead of calling the memory directly in order to allow different algorithms to work
both with a shared and a local memory.
:param func: the name of the memory function to call
:param args: the arguments to supply to the function
:return: the return value of the function
"""
if self.shared_memory:
result = self.shared_memory_scratchpad.internal_call(self.memory_lookup_name, func, args)
else:
if type(args) != tuple:
args = (args,)
result = getattr(self.memory, func)(*args)
return result
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Exploration"] = np.round(self.exploration_policy.get_control_param(), 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_step_in_episode_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_episode_logger.set_current_time(self.current_episode_steps_counter)
self.agent_episode_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_episode_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_episode_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_episode_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_episode_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_episode_logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
self.agent_episode_logger.create_signal_value("Shaped Accumulated Reward", self.total_shaped_reward_in_current_episode)
self.agent_episode_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_episode_logger.update_wall_clock_time(self.current_episode_steps_counter)
for signal in self.step_signals:
self.agent_episode_logger.create_signal_value(signal.name, signal.get_last_value())
# dump
self.agent_episode_logger.dump_output_csv()
def update_log(self):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
self.agent_logger.set_current_time(self.current_episode)
self.agent_logger.create_signal_value('Training Iter', self.training_iteration)
self.agent_logger.create_signal_value('In Heatup', int(self._phase == RunPhase.HEATUP))
self.agent_logger.create_signal_value('ER #Transitions', self.call_memory('num_transitions'))
self.agent_logger.create_signal_value('ER #Episodes', self.call_memory('length'))
self.agent_logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
self.agent_logger.create_signal_value('Total steps', self.total_steps_counter)
self.agent_logger.create_signal_value("Epsilon", np.mean(self.exploration_policy.get_control_param()))
self.agent_logger.create_signal_value("Shaped Training Reward", self.total_shaped_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
if self._phase == RunPhase.TRAIN else np.nan)
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.agent_logger.update_wall_clock_time(self.current_episode)
if self._phase != RunPhase.TEST:
self.agent_logger.create_signal_value('Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Shaped Evaluation Reward', np.nan, overwrite=False)
self.agent_logger.create_signal_value('Success Rate', np.nan, overwrite=False)
for signal in self.episode_signals:
self.agent_logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
self.agent_logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
self.agent_logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
self.agent_logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
# dump
if self.current_episode % self.ap.visualization.dump_signals_to_csv_every_x_episodes == 0 \
and self.current_episode > 0:
self.agent_logger.dump_output_csv()
def handle_episode_ended(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode_buffer.is_complete = True
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.current_episode += 1
if self.phase != RunPhase.TEST and isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('store_episode', self.current_episode_buffer)
if self.phase == RunPhase.TEST:
self.accumulated_rewards_across_evaluation_episodes += self.total_reward_in_current_episode
self.accumulated_shaped_rewards_across_evaluation_episodes += self.total_shaped_reward_in_current_episode
self.num_evaluation_episodes_completed += 1
if self.spaces.reward.reward_success_threshold and \
self.total_reward_in_current_episode >= self.spaces.reward.reward_success_threshold:
self.num_successes_across_evaluation_episodes += 1
if self.ap.visualization.dump_csv:
self.update_log()
if self.ap.is_a_highest_level_agent or self.ap.task_parameters.verbosity == "high":
self.log_to_screen()
def reset_internal_state(self):
"""
Reset all the episodic parameters
:return: None
"""
for signal in self.episode_signals:
signal.reset()
for signal in self.step_signals:
signal.reset()
self.agent_episode_logger.set_episode_idx(self.current_episode)
self.total_shaped_reward_in_current_episode = 0
self.total_reward_in_current_episode = 0
self.curr_state = {}
self.current_episode_steps_counter = 0
self.episode_running_info = {}
self.current_episode_buffer = Episode(discount=self.ap.algorithm.discount)
if self.exploration_policy:
self.exploration_policy.reset()
self.input_filter.reset()
self.output_filter.reset()
self.pre_network_filter.reset()
if isinstance(self.memory, EpisodicExperienceReplay):
self.call_memory('verify_last_episode_is_closed')
for network in self.networks.values():
network.online_network.reset_internal_memory()
def learn_from_batch(self, batch) -> Tuple[float, List, List]:
"""
Given a batch of transitions, calculates their target values and updates the network.
:param batch: A list of transitions
:return: The total loss of the training, the loss per head and the unclipped gradients
"""
return 0, [], []
def _should_update_online_weights_to_target(self):
"""
Determine if online weights should be copied to the target.
:return: boolean: True if the online weights should be copied to the target.
"""
# update the target network of every network that has a target network
step_method = self.ap.algorithm.num_steps_between_copying_online_weights_to_target
if step_method.__class__ == TrainingSteps:
should_update = (self.training_iteration - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.training_iteration
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_target_network_update_step) >= step_method.num_steps
if should_update:
self.last_target_network_update_step = self.total_steps_counter
else:
raise ValueError("The num_steps_between_copying_online_weights_to_target parameter should be either "
"EnvironmentSteps or TrainingSteps. Instead it is {}".format(step_method.__class__))
return should_update
def _should_train(self, wait_for_full_episode=False):
"""
Determine if we should start a training phase according to the number of steps passed since the last training
:return: boolean: True if we should start a training phase
"""
step_method = self.ap.algorithm.num_consecutive_playing_steps
if step_method.__class__ == EnvironmentEpisodes:
should_update = (self.current_episode - self.last_training_phase_step) >= step_method.num_steps
if should_update:
self.last_training_phase_step = self.current_episode
elif step_method.__class__ == EnvironmentSteps:
should_update = (self.total_steps_counter - self.last_training_phase_step) >= step_method.num_steps
if wait_for_full_episode:
should_update = should_update and self.current_episode_steps_counter == 0
if should_update:
self.last_training_phase_step = self.total_steps_counter
else:
raise ValueError("The num_consecutive_playing_steps parameter should be either "
"EnvironmentSteps or Episodes. Instead it is {}".format(step_method.__class__))
return should_update
def train(self):
"""
Check if a training phase should be done as configured by num_consecutive_playing_steps.
If it should, then do several training steps as configured by num_consecutive_training_steps.
A single training iteration: Sample a batch, train on it and update target networks.
:return: The total training loss during the training iterations.
"""
loss = 0
if self._should_train():
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
# TODO: this should be network dependent
network_parameters = list(self.ap.network_wrappers.values())[0]
# update counters
self.training_iteration += 1
# sample a batch and train on it
batch = self.call_memory('sample', network_parameters.batch_size)
if self.pre_network_filter is not None:
batch = self.pre_network_filter.filter(batch, update_internal_state=False, deep_copy=False)
# if the batch returned empty then there are not enough samples in the replay buffer -> skip
# training step
if len(batch) > 0:
# train
batch = Batch(batch)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
loss += total_loss
self.unclipped_grads.add_sample(unclipped_grads)
# TODO: the learning rate decay should be done through the network instead of here
# decay learning rate
if network_parameters.learning_rate_decay_rate != 0:
self.curr_learning_rate.add_sample(self.networks['main'].sess.run(
self.networks['main'].online_network.current_learning_rate))
else:
self.curr_learning_rate.add_sample(network_parameters.learning_rate)
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
self.loss.add_sample(loss)
if self.imitation:
self.log_to_screen()
# run additional commands after the training is done
self.post_training_commands()
return loss
def choose_action(self, curr_state):
"""
        Choose an action to act with in the current episode being played. Different behavior may be exhibited when
        training or testing.
:param curr_state: the current state to act upon.
:return: chosen action, some action value describing the action (q-value, probability, etc)
"""
pass
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
network_name: str):
"""
        Convert curr_state into the input tensors that TensorFlow expects, i.e. if we have several input states,
        stack all of the observations together, all of the measurements together, etc.
"""
# convert to batch so we can run it through the network
states = force_list(states)
batches_dict = {}
for key in self.ap.network_wrappers[network_name].input_embedders_parameters.keys():
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
if key in states[0].keys():
batches_dict[key] = np.array([np.array(state[key]) for state in states])
return batches_dict
def act(self) -> ActionInfo:
"""
        Given the agent's current knowledge, decide on the next action to apply to the environment
:return: an action and a dictionary containing any additional info from the action decision process
"""
if self.phase == RunPhase.TRAIN and self.ap.algorithm.num_consecutive_playing_steps.num_steps == 0:
# This agent never plays while training (e.g. behavioral cloning)
return None
# count steps (only when training or if we are in the evaluation worker)
if self.phase != RunPhase.TEST or self.ap.task_parameters.evaluate_only:
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# decide on the action
if self.phase == RunPhase.HEATUP and not self.ap.algorithm.heatup_using_network_decisions:
# random action
self.last_action_info = self.spaces.action.sample_with_info()
else:
# informed action
if self.pre_network_filter is not None:
# before choosing an action, first use the pre_network_filter to filter out the current state
curr_state = self.run_pre_network_filter_for_inference(self.curr_state)
else:
curr_state = self.curr_state
self.last_action_info = self.choose_action(curr_state)
filtered_action_info = self.output_filter.filter(self.last_action_info)
return filtered_action_info
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response)[0].next_state
def get_state_embedding(self, state: dict) -> np.ndarray:
"""
Given a state, get the corresponding state embedding from the main network
:param state: a state dict
:return: a numpy embedding vector
"""
# TODO: this won't work anymore
# TODO: instead of the state embedding (which contains the goal) we should use the observation embedding
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(state, "main"),
outputs=self.networks['main'].online_network.state_embedding)
return embedding
def update_transition_before_adding_to_replay_buffer(self, transition: Transition) -> Transition:
"""
Allows agents to update the transition just before adding it to the replay buffer.
Can be useful for agents that want to tweak the reward, termination signal, etc.
:param transition: the transition to update
:return: the updated transition
"""
return transition
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment, distill the observation from it and store it for later use.
        The response is an EnvResponse object containing the new observation and measurements, the reward, a game
        over flag and any additional information necessary.
:param env_response: result of call from environment.step(action)
:return:
"""
# filter the env_response
filtered_env_response = self.input_filter.filter(env_response)[0]
# inject agent collected statistics, if required
if self.ap.algorithm.use_accumulated_reward_as_measurement:
if 'measurements' in filtered_env_response.next_state:
filtered_env_response.next_state['measurements'] = np.append(filtered_env_response.next_state['measurements'],
self.total_shaped_reward_in_current_episode)
else:
filtered_env_response.next_state['measurements'] = np.array([self.total_shaped_reward_in_current_episode])
        # if we are in the first step of the episode, then we don't have a next state or a reward yet, and thus no
        # transition to store in the memory.
        # also, we have not reached the goal yet.
if self.current_episode_steps_counter == 0:
# initialize the current state
self.curr_state = filtered_env_response.next_state
return env_response.game_over
else:
transition = Transition(state=copy.copy(self.curr_state), action=self.last_action_info.action,
reward=filtered_env_response.reward, next_state=filtered_env_response.next_state,
game_over=filtered_env_response.game_over, info=filtered_env_response.info)
# now that we have formed a basic transition - the next state progresses to be the current state
self.curr_state = filtered_env_response.next_state
# make agent specific changes to the transition if needed
transition = self.update_transition_before_adding_to_replay_buffer(transition)
# merge the intrinsic reward in
if self.ap.algorithm.scale_external_reward_by_intrinsic_reward_value:
transition.reward = transition.reward * (1 + self.last_action_info.action_intrinsic_reward)
else:
transition.reward = transition.reward + self.last_action_info.action_intrinsic_reward
# sum up the total shaped reward
self.total_shaped_reward_in_current_episode += transition.reward
self.total_reward_in_current_episode += env_response.reward
self.shaped_reward.add_sample(transition.reward)
self.reward.add_sample(env_response.reward)
# add action info to transition
if type(self.parent).__name__ == 'CompositeAgent':
transition.add_info(self.parent.last_action_info.__dict__)
else:
transition.add_info(self.last_action_info.__dict__)
# create and store the transition
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP]:
# for episodic memories we keep the transitions in a local buffer until the episode is ended.
# for regular memories we insert the transitions directly to the memory
if isinstance(self.memory, EpisodicExperienceReplay):
self.current_episode_buffer.insert(transition)
else:
self.call_memory('store', transition)
if self.ap.visualization.dump_in_episode_signals:
self.update_step_in_episode_log()
return transition.game_over
def post_training_commands(self):
pass
def get_predictions(self, states: List[Dict[str, np.ndarray]], prediction_type: PredictionType):
"""
Get a prediction from the agent with regard to the requested prediction_type.
        If the agent cannot predict this type of prediction_type, or if there is more than one possible way to do so,
        raise a ValueError.
:param states:
:param prediction_type:
:return:
"""
predictions = self.networks['main'].online_network.predict_with_prediction_type(
# states=self.dict_state_to_batches_dict(states, 'main'), prediction_type=prediction_type)
states=states, prediction_type=prediction_type)
if len(predictions.keys()) != 1:
raise ValueError("The network has more than one component {} matching the requested prediction_type {}. ".
format(list(predictions.keys()), prediction_type))
return list(predictions.values())[0]
def set_incoming_directive(self, action: ActionType) -> None:
if isinstance(self.in_action_space, GoalsSpace):
self.current_hrl_goal = action
elif isinstance(self.in_action_space, AttentionActionSpace):
self.input_filter.observation_filters['attention'].crop_low = action[0]
self.input_filter.observation_filters['attention'].crop_high = action[1]
self.output_filter.action_filters['masking'].set_masking(action[0], action[1])
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Allows agents to store additional information when saving checkpoints.
:param checkpoint_id: the id of the checkpoint
:return: None
"""
pass
def sync(self) -> None:
"""
Sync the global network parameters to local networks
:return: None
"""
for network in self.networks.values():
network.sync()
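
# --- Editor's illustration (not part of the original commit) ---
# prepare_batch_for_inference() above stacks a list of per-step state dicts into
# one numpy array per observation key, which is the layout the network inputs
# expect. The stripped-down sketch below shows just that stacking step, without
# the network-wrapper specific key filtering.
def _reference_stack_states(states):
    """states: a list of dicts mapping observation name -> np.ndarray."""
    return {key: np.array([np.array(state[key]) for state in states])
            for key in states[0].keys()}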

rl_coach/agents/agent_interface.py Normal file

@@ -0,0 +1,125 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List, Dict
import numpy as np
from rl_coach.core_types import EnvResponse, ActionInfo, RunPhase, PredictionType, ActionType
class AgentInterface(object):
def __init__(self):
self._phase = RunPhase.HEATUP
self._parent = None
self.spaces = None
@property
def parent(self):
"""
Get the parent class of the agent
        :return: the parent of the agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent class of the agent
:param val: the new parent
:return: None
"""
self._parent = val
@property
def phase(self) -> RunPhase:
"""
Get the phase of the agent
:return: the current phase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase):
"""
Change the phase of the agent
:param val: the new phase
:return: None
"""
self._phase = val
def reset_internal_state(self) -> None:
"""
Reset the episode parameters for the agent
:return: None
"""
raise NotImplementedError("")
def train(self) -> Union[float, List]:
"""
        Train the agent's network
:return: The loss of the training
"""
raise NotImplementedError("")
def act(self) -> ActionInfo:
"""
Get a decision of the next action to take.
        The action depends on the current state, which the agent holds either from resetting the environment or from
        the observe function.
:return: A tuple containing the actual action and additional info on the action
"""
raise NotImplementedError("")
def observe(self, env_response: EnvResponse) -> bool:
"""
Gets a response from the environment.
Processes this information for later use. For example, create a transition and store it in memory.
The action info (a class containing any info the agent wants to store regarding its action decision process) is
stored by the agent itself when deciding on the action.
        :param env_response: an EnvResponse containing the response from the environment
        :return: a done signal which is based on the agent's knowledge. This can be different from the done signal from
the environment. For example, an agent can decide to finish the episode each time it gets some
intrinsic reward
"""
raise NotImplementedError("")
def save_checkpoint(self, checkpoint_id: int) -> None:
"""
Save the model of the agent to the disk. This can contain the network parameters, the memory of the agent, etc.
:param checkpoint_id: the checkpoint id to use for saving
:return: None
"""
raise NotImplementedError("")
def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
"""
Get a prediction from the agent with regard to the requested prediction_type. If the agent cannot predict this
type of prediction_type, or if there is more than possible way to do so, raise a ValueException.
:param states:
:param prediction_type:
:return: the agent's prediction
"""
raise NotImplementedError("")
def set_incoming_directive(self, action: ActionType) -> None:
"""
Pass a higher level command (directive) to the agent.
For example, a higher level agent can set the goal of the agent.
:param action: the directive to pass to the agent
:return: None
"""
raise NotImplementedError("")
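
# --- Editor's illustration (not part of the original commit) ---
# A minimal, do-nothing subclass showing which methods a concrete agent has to
# override. The constant action returned by act() is purely hypothetical and is
# only there to make the sketch self-contained; real agents derive it from the
# current state and an exploration policy.
class _NoOpAgent(AgentInterface):
    def reset_internal_state(self) -> None:
        pass

    def train(self) -> float:
        return 0.0

    def act(self) -> ActionInfo:
        return ActionInfo(action=0)  # hypothetical constant action

    def observe(self, env_response: EnvResponse) -> bool:
        return env_response.game_over

    def save_checkpoint(self, checkpoint_id: int) -> None:
        pass

    def get_predictions(self, states: Dict, prediction_type: PredictionType) -> np.ndarray:
        raise NotImplementedError("this sketch has no network to predict with")

    def set_incoming_directive(self, action: ActionType) -> None:
        pass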

rl_coach/agents/bc_agent.py Normal file

@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.imitation_agent import ImitationAgent
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.base_parameters import AgentParameters, AlgorithmParameters, NetworkParameters, InputEmbedderParameters, \
MiddlewareScheme
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class BCAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.collect_new_data = False
class BCNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class BCAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=BCAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.bc_agent:BCAgent'
# Behavioral Cloning Agent
class BCAgent(ImitationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
        # When using a policy head, the targets are the advantages that we normally feed the head with.
        # In this case, we need the policy head to simply predict probabilities, so while we usually train the
        # network on log(Pi) * Advantages, here we train it on log(Pi) alone, which after the softmax will
        # predict Pi (= the action probabilities)
targets = np.ones(batch.actions().shape[0])
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions()},
targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
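
# --- Editor's illustration (not part of the original commit) ---
# The policy head normally minimizes -log(pi(a|s)) * advantage. Feeding a target
# of 1 for every transition (targets = np.ones(...) above) therefore reduces the
# loss to the plain negative log-likelihood of the demonstrated actions, i.e.
# behavioral cloning. A toy numpy version of that loss, for a discrete action
# space:
def _reference_bc_loss(action_probabilities, demonstrated_actions):
    """Mean negative log-likelihood of the demonstrated actions.

    :param action_probabilities: (batch, num_actions) softmax outputs
    :param demonstrated_actions: (batch,) integer action indices
    """
    picked = action_probabilities[np.arange(len(demonstrated_actions)), demonstrated_actions]
    return -np.mean(np.log(picked))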

rl_coach/agents/bootstrapped_dqn_agent.py Normal file

@@ -0,0 +1,84 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.exploration_policies.bootstrapped import BootstrappedParameters
class BootstrappedDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.num_output_head_copies = 10
self.rescale_gradient_from_head_by_factor = [1.0/self.num_output_head_copies]*self.num_output_head_copies
class BootstrappedDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.network_wrappers = {"main": BootstrappedDQNNetworkParameters()}
self.exploration = BootstrappedParameters()
@property
def path(self):
return 'rl_coach.agents.bootstrapped_dqn_agent:BootstrappedDQNAgent'
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
class BootstrappedDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def reset_internal_state(self):
super().reset_internal_state()
self.exploration_policy.select_head()
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
next_states_online_values = self.networks['main'].online_network.predict(batch.next_states(network_keys))
result = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
q_st_plus_1 = result[:self.ap.exploration.architecture_num_q_heads]
TD_targets = result[self.ap.exploration.architecture_num_q_heads:]
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
for i in range(self.ap.network_wrappers['main'].batch_size):
mask = batch[i].info['mask']
for head_idx in range(self.ap.exploration.architecture_num_q_heads):
if mask[head_idx] == 1:
selected_action = np.argmax(next_states_online_values[head_idx][i], 0)
TD_targets[head_idx][i, batch.actions()[i]] = \
batch.rewards()[i] + (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount \
* q_st_plus_1[head_idx][i][selected_action]
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def observe(self, env_response):
mask = np.random.binomial(1, self.ap.exploration.bootstrapped_data_sharing_probability,
self.ap.exploration.architecture_num_q_heads)
env_response.info['mask'] = mask
return super().observe(env_response)
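
# --- Editor's illustration (not part of the original commit) ---
# Every observed transition gets a binary mask with one entry per Q head
# (see observe() above); a head only bootstraps from the transitions whose
# mask bit is 1 in learn_from_batch(). The sketch below shows what such masks
# look like; the sharing probability and head count used here are hypothetical
# defaults, the real values come from BootstrappedParameters.
def _reference_bootstrap_masks(num_transitions, num_q_heads=10, sharing_probability=0.5):
    """Sample one Bernoulli(sharing_probability) mask of length num_q_heads per transition."""
    return np.random.binomial(1, sharing_probability, size=(num_transitions, num_q_heads))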

rl_coach/agents/categorical_dqn_agent.py Normal file

@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.categorical_q_head import CategoricalQHeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import StateType
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class CategoricalDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [CategoricalQHeadParameters()]
class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.v_min = -10.0
self.v_max = 10.0
self.atoms = 51
class CategoricalDQNExplorationParameters(EGreedyParameters):
def __init__(self):
super().__init__()
self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.evaluation_epsilon = 0.001
class CategoricalDQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
exploration=CategoricalDQNExplorationParameters(),
memory=ExperienceReplayParameters(),
networks={"main": CategoricalDQNNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class CategoricalDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)
def distribution_prediction_to_q_values(self, prediction):
return np.dot(prediction, self.z_values)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
prediction = self.get_prediction(states)
q_values = self.distribution_prediction_to_q_values(prediction)
else:
q_values = None
return q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is calculated by the atoms distribution
# for all other actions, the error is 0
distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)
m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))
batches = np.arange(self.ap.network_wrappers['main'].batch_size)
for j in range(self.z_values.size):
tzj = np.fmax(np.fmin(batch.rewards() +
(1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],
self.z_values[self.z_values.size - 1]),
self.z_values[0])
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
u = (np.ceil(bj)).astype(int)
l = (np.floor(bj)).astype(int)
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
# total_loss = cross entropy between actual result above and predicted result for the given action
TD_targets[batches, batch.actions()] = m
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
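
# --- Editor's illustration (not part of the original commit) ---
# The loop in learn_from_batch() projects the shifted target support
# r + gamma * z_j onto the fixed atoms z, splitting each atom's probability
# mass between its two nearest neighbours on the support (the C51 projection).
# The helper below performs the same projection for a single transition, which
# is easier to step through than the vectorized batch version above.
def _reference_categorical_projection(reward, game_over, discount, z_values, target_distribution):
    """Project one transition's target distribution onto the z_values support."""
    m = np.zeros_like(z_values)
    for z_j, p_j in zip(z_values, target_distribution):
        tzj = np.clip(reward + (1.0 - game_over) * discount * z_j, z_values[0], z_values[-1])
        bj = (tzj - z_values[0]) / (z_values[1] - z_values[0])
        l, u = int(np.floor(bj)), int(np.ceil(bj))
        m[l] += p_j * (u - bj)
        m[u] += p_j * (bj - l)
    return m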

rl_coach/agents/clipped_ppo_agent.py Normal file

@@ -0,0 +1,277 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from random import shuffle
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.core_types import EnvironmentSteps, Batch, EnvResponse, StateType
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class ClippedPPONetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters(), PPOHeadParameters()]
self.loss_weights = [1.0, 1.0]
self.rescale_gradient_from_head_by_factor = [1, 1]
self.batch_size = 64
self.optimizer_type = 'Adam'
self.clip_gradients = None
self.use_separate_networks_per_head = True
self.async_training = False
self.l2_regularization = 0
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
class ClippedPPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_episodes_in_experience_replay = 1000000
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.95
self.use_kl_regularization = False
self.clip_likelihood_ratio_using_epsilon = 0.2
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.beta_entropy = 0.01 # should be 0 for mujoco
self.num_consecutive_playing_steps = EnvironmentSteps(2048)
self.optimization_epochs = 10
self.normalization_stats = None
self.clipping_decay_schedule = ConstantSchedule(1)
class ClippedPPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=ClippedPPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": ClippedPPONetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.clipped_ppo_agent:ClippedPPOAgent'
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
class ClippedPPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
self.value_targets = self.register_signal('Value Targets')
self.kl_divergence = self.register_signal('KL Divergence')
self.likelihood_ratio = self.register_signal('Likelihood Ratio')
self.clipped_likelihood_ratio = self.register_signal('Clipped Likelihood Ratio')
def set_session(self, sess):
super().set_session(sess)
if self.ap.algorithm.normalization_stats is not None:
self.ap.algorithm.normalization_stats.set_session(sess)
def fill_advantages(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
current_state_values = self.networks['main'].online_network.predict(batch.states(network_keys))[0]
current_state_values = current_state_values.squeeze()
self.state_values.add_sample(current_state_values)
# calculate advantages
advantages = []
value_targets = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
value_targets = np.array([])
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, gae_based_value_targets = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
value_targets = np.append(value_targets, gae_based_value_targets)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
for transition, advantage, value_target in zip(batch.transitions, advantages, value_targets):
transition.info['advantage'] = advantage
transition.info['gae_based_value_target'] = value_target
self.action_advantages.add_sample(advantages)
def train_network(self, batch, epochs):
batch_results = []
for j in range(epochs):
batch.shuffle()
batch_results = {
'total_loss': [],
'losses': [],
'unclipped_grads': [],
'kl_divergence': [],
'entropy': []
}
fetches = [self.networks['main'].online_network.output_heads[1].kl_divergence,
self.networks['main'].online_network.output_heads[1].entropy,
self.networks['main'].online_network.output_heads[1].likelihood_ratio,
self.networks['main'].online_network.output_heads[1].clipped_likelihood_ratio]
for i in range(int(batch.size / self.ap.network_wrappers['main'].batch_size)):
start = i * self.ap.network_wrappers['main'].batch_size
end = (i + 1) * self.ap.network_wrappers['main'].batch_size
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
actions = batch.actions()[start:end]
gae_based_value_targets = batch.info('gae_based_value_target')[start:end]
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
# TODO-perf - the target network ("old_policy") does not change during the optimization epochs, so this
# prediction can be calculated once for all epochs; the shuffling should then be performed on indices only.
result = self.networks['main'].target_network.predict({k: v[start:end] for k, v in batch.states(network_keys).items()})
old_policy_distribution = result[1:]
# calculate gradients and apply on both the local policy network and on the global policy network
if self.ap.algorithm.estimate_state_value_using_gae:
value_targets = np.expand_dims(gae_based_value_targets, -1)
else:
value_targets = batch.total_returns(expand_dims=True)[start:end]
inputs = copy.copy({k: v[start:end] for k, v in batch.states(network_keys).items()})
inputs['output_1_0'] = actions
# The old_policy_distribution needs to be represented as a list, because for discrete controls it has just
# a mean, while otherwise it has both a mean and a standard deviation
for input_index, input in enumerate(old_policy_distribution):
inputs['output_1_{}'.format(input_index + 1)] = input
inputs['output_1_3'] = self.ap.algorithm.clipping_decay_schedule.current_value
total_loss, losses, unclipped_grads, fetch_result = \
self.networks['main'].train_and_sync_networks(
inputs, [value_targets, batch.info('advantage')[start:end]], additional_fetches=fetches
)
batch_results['total_loss'].append(total_loss)
batch_results['losses'].append(losses)
batch_results['unclipped_grads'].append(unclipped_grads)
batch_results['kl_divergence'].append(fetch_result[0])
batch_results['entropy'].append(fetch_result[1])
self.unclipped_grads.add_sample(unclipped_grads)
self.value_targets.add_sample(value_targets)
self.likelihood_ratio.add_sample(fetch_result[2])
self.clipped_likelihood_ratio.add_sample(fetch_result[3])
for key in batch_results.keys():
batch_results[key] = np.mean(batch_results[key], 0)
self.value_loss.add_sample(batch_results['losses'][0])
self.policy_loss.add_sample(batch_results['losses'][1])
if self.ap.network_wrappers['main'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['main'].online_network.get_variable_value(
self.networks['main'].online_network.adaptive_learning_rate_scheme)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['main'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", batch_results['losses'][1]),
("KL divergence", batch_results['kl_divergence']),
("Entropy", batch_results['entropy']),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = batch_results['kl_divergence']
self.entropy.add_sample(batch_results['entropy'])
self.kl_divergence.add_sample(batch_results['kl_divergence'])
return batch_results['losses']
def post_training_commands(self):
# clean memory
self.call_memory('clean')
def train(self):
if self._should_train(wait_for_full_episode=True):
dataset = self.memory.transitions
dataset = self.pre_network_filter.filter(dataset, deep_copy=False)
batch = Batch(dataset)
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['main'].sync()
self.fill_advantages(batch)
# take only the requested number of steps
if isinstance(self.ap.algorithm.num_consecutive_playing_steps, EnvironmentSteps):
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
shuffle(dataset)
batch = Batch(dataset)
self.train_network(batch, self.ap.algorithm.optimization_epochs)
self.post_training_commands()
self.training_iteration += 1
# self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return None
def run_pre_network_filter_for_inference(self, state: StateType):
dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False)
return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state
def choose_action(self, curr_state):
self.ap.algorithm.clipping_decay_schedule.step()
return super().choose_action(curr_state)
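
As a point of reference for what the PPO head's loss is expected to compute from the value targets and advantages fed above, here is a minimal numpy sketch of the clipped surrogate objective, assuming epsilon corresponds to clip_likelihood_ratio_using_epsilon (illustrative only, not taken from the head implementation):

import numpy as np

def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon=0.2):
    ratio = np.exp(new_log_probs - old_log_probs)
    clipped_ratio = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    # PPO maximizes the elementwise minimum of the two surrogate terms; returned here as a loss (negated mean)
    return -np.mean(np.minimum(ratio * advantages, clipped_ratio * advantages))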

View File

@@ -0,0 +1,415 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import itertools
from enum import Enum
from typing import Union, List, Dict
import numpy as np
from rl_coach.agents.agent_interface import AgentInterface
from rl_coach.base_parameters import AgentParameters, VisualizationParameters
# from rl_coach.environments.environment_interface import ActionSpace
from rl_coach.spaces import ActionSpace
from rl_coach.spaces import AgentSelection, AttentionActionSpace, ObservationSpace, SpacesDefinition
from rl_coach.utils import short_dynamic_import
from rl_coach.core_types import ActionInfo, EnvResponse, ActionType, RunPhase
from rl_coach.filters.observation.observation_crop_filter import ObservationCropFilter
class DecisionPolicy(object):
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, decide on a single action to take.
:param actions_info: a dictionary of agent names and their corresponding
ActionInfo instances, containing the information for each agent's action
:return: a single action and the corresponding action info
"""
raise NotImplementedError("")
class SingleDecider(DecisionPolicy):
"""
A decision policy that chooses the action according to the agent that is currently in control.
"""
def __init__(self, default_decision_maker: str):
super().__init__()
self._decision_maker = default_decision_maker
@property
def decision_maker(self):
"""
Get the decision maker that was set by the upper level control.
"""
return self._decision_maker
@decision_maker.setter
def decision_maker(self, decision_maker: str):
"""
Set the decision maker by the upper level control.
:param decision_maker: the name of the decision making agent, as set by the upper level control.
"""
self._decision_maker = decision_maker
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action of the current decision maker
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action and the corresponding action info
"""
if self.decision_maker not in actions_info.keys():
raise ValueError("The current decision maker ({}) does not exist in the given actions ({})"
.format(self.decision_maker, actions_info.keys()))
return actions_info[self.decision_maker]
class RoundRobin(DecisionPolicy):
"""
A decision policy that chooses the action according to agents selected in a circular order.
"""
def __init__(self, num_agents: int):
super().__init__()
self.round_robin = itertools.cycle(range(num_agents))
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action of the current decision maker, which is
selected in a circular order
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
decision_maker = next(self.round_robin)
if decision_maker not in range(len(actions_info)):
raise ValueError("The size of actions_info does not match the number of agents set for the RoundRobin"
" decision policy.")
# dict views are not indexable, so convert the values to a list before indexing
return list(actions_info.values())[decision_maker]
class MajorityVote(DecisionPolicy):
"""
A decision policy that chooses the action that most of the agents chose.
This policy is only useful for discrete control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the action that most of the agents agree on
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce discrete action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
vote_count = np.bincount([action_info.action for action_info in actions_info.values()])
majority_vote = np.argmax(vote_count)
# return the action that received the most votes (dict views are not indexable by an action value)
return ActionInfo(int(majority_vote))
class MeanDecision(DecisionPolicy):
"""
A decision policy that takes the mean action given the actions of all the agents.
This policy is only useful for continuous control.
"""
def __init__(self):
super().__init__()
def choose_action(self, actions_info: Dict[str, ActionInfo]) -> ActionInfo:
"""
Given a dictionary of actions from multiple agents, take the mean action
:param actions_info: a dictionary of agent names to ActionInfo instances containing the information for each agent's action
:return: a single action
"""
# TODO: enforce continuous action spaces
if len(actions_info.keys()) == 0:
raise ValueError("The given list of actions is empty")
mean = np.mean([action_info.action for action_info in actions_info.values()], axis=0)
return ActionInfo(mean)
class RewardPolicy(Enum):
ReachingGoal = 0
NativeEnvironmentReward = 1
AccumulatedEnvironmentRewards = 2
class CompositeAgent(AgentInterface):
"""
A CompositeAgent is a group of agents in the same hierarchy level.
In a CompositeAgent, each agent may take the role of either a controller or an observer.
Each agent that is defined as an observer gets observations from the environment.
Each agent that is defined as a controller can potentially also control the environment, in addition to observing it.
There are several ways to decide on the action from different controller agents:
1. Ensemble -
- Take the majority vote (discrete controls)
- Take the mean action (continuous controls)
- Round robin between the agents (discrete/continuous)
2. Skills -
- At each step a single agent decides (chosen by the upper hierarchy controlling agent)
A CompositeAgent can be controlled using one of the following methods (ActionSpaces):
1. Goals (in terms of measurements, observation, embedding or a change in those values)
2. Agent Selection (skills) / Discrete action space.
3. Attention (a subset of the real environment observation / action space)
"""
def __init__(self,
agents_parameters: Union[AgentParameters, Dict[str, AgentParameters]],
visualization_parameters: VisualizationParameters,
decision_policy: DecisionPolicy,
out_action_space: ActionSpace,
in_action_space: Union[None, ActionSpace]=None,
decision_makers: Union[bool, Dict[str, bool]]=True,
reward_policy: RewardPolicy=RewardPolicy.NativeEnvironmentReward,
name="CompositeAgent"):
"""
Construct an agent group
:param agents_parameters: the parameters of each of the agents in the group (a single AgentParameters instance or a dictionary of agent names to AgentParameters)
:param decision_policy: the decision policy of the group which describes how actions are consolidated
:param out_action_space: the type of action space that is used by this composite agent in order to control the
underlying environment
:param in_action_space: the type of action space that is used by the upper level agent in order to control this
group
:param decision_makers: a dictionary mapping each agent name to a boolean stating whether it has decision
privileges or is just an observer
:param reward_policy: the type of the reward that the group receives
"""
super().__init__()
if isinstance(agents_parameters, AgentParameters):
decision_makers = {agents_parameters.name: True}
agents_parameters = {agents_parameters.name: agents_parameters}
self.agents_parameters = agents_parameters
self.visualization_parameters = visualization_parameters
self.decision_makers = decision_makers
self.decision_policy = decision_policy
self.in_action_space = in_action_space
self.out_action_space = out_action_space # TODO: this is not being used
self.reward_policy = reward_policy
self.full_name_id = self.name = name
self.current_decision_maker = 0
self.environment = None
self.agents = {} # key = agent_name, value = agent
self.incoming_action = None
self.last_state = None
self._phase = RunPhase.HEATUP
self.last_action_info = None
self.current_episode = 0
self.parent_level_manager = None
# environment spaces
self.spaces = None
# counters for logging
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# validate input
if set(self.decision_makers) != set(self.agents_parameters):
raise ValueError("The decision_makers dictionary keys do not match the names of the given agents")
if sum(self.decision_makers.values()) > 1 and type(self.decision_policy) == SingleDecider \
and type(self.in_action_space) != AgentSelection:
raise ValueError("When the control policy is set to single decider, the master policy should control the "
"agent group via agent selection (ControlType.AgentSelection)")
@property
def parent(self):
"""
Get the parent of the composite agent
:return: the parent of the composite agent
"""
return self._parent
@parent.setter
def parent(self, val):
"""
Change the parent of the composite agent.
Additionally, updates the full name of the agent
:param val: the new parent
:return: None
"""
self._parent = val
if not hasattr(self._parent, 'name'):
raise ValueError("The parent of a composite agent must have a name")
self.full_name_id = "{}/{}".format(self._parent.name, self.name)
def create_agents(self):
for agent_name, agent_parameters in self.agents_parameters.items():
agent_parameters.name = agent_name
# create agent
self.agents[agent_parameters.name] = short_dynamic_import(agent_parameters.path)(agent_parameters,
parent=self)
self.agents[agent_parameters.name].parent_level_manager = self.parent_level_manager
# TODO: this is a bit too specific to be defined here
# add an attention cropping filter if the incoming directives are attention boxes
if isinstance(self.in_action_space, AttentionActionSpace):
attention_size = self.in_action_space.forced_attention_size
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'] = \
ObservationCropFilter(crop_low=np.zeros_like(attention_size), crop_high=attention_size)
agent.input_filter.observation_filters.move_to_end('attention', last=False) # add the cropping at the beginning
def setup_logger(self) -> None:
"""
Setup the logger for all the agents in the composite agent
:return: None
"""
[agent.setup_logger() for agent in self.agents.values()]
def set_session(self, sess) -> None:
"""
Set the deep learning framework session for all the agents in the composite agent
:return: None
"""
[agent.set_session(sess) for agent in self.agents.values()]
def set_environment_parameters(self, spaces: SpacesDefinition):
"""
Sets the parameters that are environment dependent. As a side effect, initializes all the components that are
dependent on those values, by calling init_environment_dependent_modules
:param spaces: the definitions of all the spaces of the environment
:return: None
"""
self.spaces = copy.deepcopy(spaces)
[agent.set_environment_parameters(self.spaces) for agent in self.agents.values()]
@property
def phase(self):
return self._phase
@phase.setter
def phase(self, val: RunPhase) -> None:
"""
Change the current phase of all the agents in the group
:param val: the new phase
:return: None
"""
self._phase = val
for agent in self.agents.values():
agent.phase = val
def end_episode(self) -> None:
"""
End an episode
:return: None
"""
self.current_episode += 1
[agent.handle_episode_ended() for agent in self.agents.values()]
def reset_internal_state(self) -> None:
"""
Reset the episode for all the agents in the group
:return: None
"""
# update counters
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.total_reward_in_current_episode = 0
# reset all sub modules
[agent.reset_internal_state() for agent in self.agents.values()]
def train(self) -> Union[float, List]:
"""
Make a single training step for all the agents of the group
:return: a list of loss values from the training step
"""
return [agent.train() for agent in self.agents.values()]
def act(self) -> ActionInfo:
"""
Get the actions from all the agents in the group. Then use the decision policy in order to
extract a single action out of the list of actions.
:return: the chosen action and its corresponding information
"""
# update counters
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# get the actions info from all the agents
actions_info = {}
for agent_name, agent in self.agents.items():
action_info = agent.act()
actions_info[agent_name] = action_info
# decide on a single action to apply to the environment
action_info = self.decision_policy.choose_action(actions_info)
# TODO: make the last action info a property?
# pass the action info to all the observers
for agent_name, is_decision_maker in self.decision_makers.items():
if not is_decision_maker:
self.agents[agent_name].last_action_info = action_info
self.last_action_info = action_info
return self.last_action_info
def observe(self, env_response: EnvResponse) -> bool:
"""
Given a response from the environment as an EnvResponse, filter it and pass it to the agents.
This method has two main jobs:
1. Wrap the previous transition, ending with the new observation coming from the EnvResponse.
2. Save the next_state as the current_state to take action upon for the next call to act().
:param env_response: the response coming from the environment
:return: True if the episode should end, False otherwise
"""
# accumulate the unfiltered rewards for visualization
self.total_reward_in_current_episode += env_response.reward
episode_ended = env_response.game_over
# pass the env_response to all the sub-agents
# TODO: what if one agent decides to end the episode but the others don't? who decides?
for agent_name, agent in self.agents.items():
goal_reached = agent.observe(env_response)
episode_ended = episode_ended or goal_reached
# TODO: unlike for a single agent, here we also treat a game over by the environment.
# probably better to only return the agents' goal_reached decisions.
return episode_ended
def save_checkpoint(self, checkpoint_id: int) -> None:
[agent.save_checkpoint(checkpoint_id) for agent in self.agents.values()]
def set_incoming_directive(self, action: ActionType) -> None:
self.incoming_action = action
if isinstance(self.decision_policy, SingleDecider) and isinstance(self.in_action_space, AgentSelection):
self.decision_policy.decision_maker = list(self.agents.keys())[action]
if isinstance(self.in_action_space, AttentionActionSpace):
# TODO: redesign to be more modular
for agent in self.agents.values():
agent.input_filter.observation_filters['attention'].crop_low = action[0]
agent.input_filter.observation_filters['attention'].crop_high = action[1]
agent.output_filter.action_filters['masking'].set_masking(action[0], action[1])
# TODO: rethink this scheme. we don't want so many if-else clauses lying around here.
# TODO - for incoming actions which do not involve setting the acting agent we should change the
# observation_space, goal to pursue, etc accordingly to the incoming action.
def sync(self) -> None:
"""
Sync the agent networks with the global network
:return:
"""
[agent.sync() for agent in self.agents.values()]
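
A hypothetical usage sketch of the decision policies defined above, assuming this module's SingleDecider and rl_coach's ActionInfo (the agent names and actions below are made up for illustration):

from rl_coach.core_types import ActionInfo

actions_info = {'driver': ActionInfo(action=2), 'navigator': ActionInfo(action=0)}

policy = SingleDecider(default_decision_maker='driver')
assert policy.choose_action(actions_info).action == 2

policy.decision_maker = 'navigator'  # the upper level control switches the acting agent
assert policy.choose_action(actions_info).action == 0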

View File

@@ -0,0 +1,192 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters, EmbedderScheme
from rl_coach.exploration_policies.ou_process import OUProcessParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace, GoalsSpace
from rl_coach.architectures.tensorflow_components.heads.ddpg_actor_head import DDPGActorHeadParameters
from rl_coach.core_types import ActionInfo, EnvironmentSteps
class DDPGCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True),
'action': InputEmbedderParameters(scheme=EmbedderScheme.Shallow)}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(batchnorm=True)}
self.middleware_parameters = FCMiddlewareParameters(batchnorm=True)
self.heads_parameters = [DDPGActorHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
self.batch_size = 64
self.async_training = False
self.learning_rate = 0.0001
self.create_target_network = True
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = False
class DDPGAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
self.num_consecutive_playing_steps = EnvironmentSteps(1)
self.use_target_network_for_evaluation = False
self.action_penalty = 0
self.clip_critic_targets = None # expected to be a tuple of the form (min_clip_value, max_clip_value) or None
self.use_non_zero_discount_for_terminal_states = False
class DDPGAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DDPGAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"actor": DDPGActorNetworkParameters(),
"critic": DDPGCriticNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ddpg_agent:DDPGAgent'
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.TD_targets_signal = self.register_signal("TD targets")
self.action_signal = self.register_signal("actions")
def learn_from_batch(self, batch):
actor = self.networks['actor']
critic = self.networks['critic']
actor_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
critic_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# TD error = r + discount * Q_target(s_t+1, mu_target(s_t+1)) - Q(s_t, a_t)
next_actions, actions_mean = actor.parallel_prediction([
(actor.target_network, batch.next_states(actor_keys)),
(actor.online_network, batch.states(actor_keys))
])
critic_inputs = copy.copy(batch.next_states(critic_keys))
critic_inputs['action'] = next_actions
q_st_plus_1 = critic.target_network.predict(critic_inputs)
# calculate the bootstrapped TD targets while discounting terminal states according to
# use_non_zero_discount_for_terminal_states
if self.ap.algorithm.use_non_zero_discount_for_terminal_states:
TD_targets = batch.rewards(expand_dims=True) + self.ap.algorithm.discount * q_st_plus_1
else:
TD_targets = batch.rewards(expand_dims=True) + \
(1.0 - batch.game_overs(expand_dims=True)) * self.ap.algorithm.discount * q_st_plus_1
# clip the TD targets to prevent overestimation errors
if self.ap.algorithm.clip_critic_targets:
TD_targets = np.clip(TD_targets, *self.ap.algorithm.clip_critic_targets)
self.TD_targets_signal.add_sample(TD_targets)
# get the gradients of the critic output with respect to the action
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = actions_mean
action_gradients = critic.online_network.predict(critic_inputs,
outputs=critic.online_network.gradients_wrt_inputs[0]['action'])
# train the critic
critic_inputs = copy.copy(batch.states(critic_keys))
critic_inputs['action'] = batch.actions(len(batch.actions().shape) == 1)
result = critic.train_and_sync_networks(critic_inputs, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
# apply the gradients from the critic to the actor
initial_feed_dict = {actor.online_network.gradients_weights_ph[0]: -action_gradients}
gradients = actor.online_network.predict(batch.states(actor_keys),
outputs=actor.online_network.weighted_gradients[0],
initial_feed_dict=initial_feed_dict)
if actor.has_global:
actor.apply_gradients_to_global_network(gradients)
actor.update_online_network()
else:
actor.apply_gradients_to_online_network(gradients)
return total_loss, losses, unclipped_grads
def train(self):
return Agent.train(self)
def choose_action(self, curr_state):
if not (isinstance(self.spaces.action, BoxActionSpace) or isinstance(self.spaces.action, GoalsSpace)):
raise ValueError("DDPG works only for continuous control problems")
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'actor')
if self.ap.algorithm.use_target_network_for_evaluation:
actor_network = self.networks['actor'].target_network
else:
actor_network = self.networks['actor'].online_network
action_values = actor_network.predict(tf_input_state).squeeze()
action = self.exploration_policy.get_action(action_values)
self.action_signal.add_sample(action)
# get q value
tf_input_state = self.prepare_batch_for_inference(curr_state, 'critic')
action_batch = np.expand_dims(action, 0)
if type(action) != np.ndarray:
action_batch = np.array([[action]])
tf_input_state['action'] = action_batch
q_value = self.networks['critic'].online_network.predict(tf_input_state)[0]
self.q_values.add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=q_value)
return action_info
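
The parameters above copy the online weights to the target networks every environment step with rate_for_copying_weights_to_target = 0.001, i.e. a soft (Polyak) update. A minimal sketch of that update rule, assuming the weights are given as plain numpy arrays (the actual update is handled by the network wrappers):

import numpy as np

def soft_update(target_weights, online_weights, tau=0.001):
    # target <- tau * online + (1 - tau) * target, applied per weight tensor
    return [tau * w_online + (1.0 - tau) * w_target
            for w_online, w_target in zip(online_weights, target_weights)]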

View File

@@ -0,0 +1,69 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import EnvironmentSteps
class DDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.ddqn_agent:DDQNAgent'
# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# initialize with the current prediction so that we will
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * q_st_plus_1[i][selected_actions[i]]
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
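
For comparison, a single-transition sketch of the Double DQN target computed in the loop above, next to the vanilla DQN target it replaces (illustrative function names, plain numpy inputs):

import numpy as np

def ddqn_target(reward, game_over, discount, q_next_online, q_next_target):
    # the online network selects the action, the target network evaluates it
    selected_action = np.argmax(q_next_online)
    return reward + (1.0 - game_over) * discount * q_next_target[selected_action]

def dqn_target(reward, game_over, discount, q_next_target):
    # vanilla DQN uses the target network both to select and to evaluate the action
    return reward + (1.0 - game_over) * discount * np.max(q_next_target)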

View File

@@ -0,0 +1,219 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.agents.agent import Agent
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
from rl_coach.architectures.tensorflow_components.heads.measurements_prediction_head import MeasurementsPredictionHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class HandlingTargetsAfterEpisodeEnd(Enum):
LastStep = 0
NAN = 1
class DFPNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
'goal': InputEmbedderParameters(activation_function='leaky_relu')}
self.input_embedders_parameters['observation'].scheme = [
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1]),
Dense([512]),
]
self.input_embedders_parameters['measurements'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.input_embedders_parameters['goal'].scheme = [
Dense([128]),
Dense([128]),
Dense([128]),
]
self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
scheme=MiddlewareScheme.Empty)
self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
self.loss_weights = [1.0]
self.async_training = False
self.batch_size = 64
self.adam_optimizer_beta1 = 0.95
class DFPMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
self.max_size = (MemoryGranularity.Transitions, 20000)
self.shared_memory = True
super().__init__()
class DFPAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_predicted_steps_ahead = 6
self.goal_vector = [1.0, 1.0]
self.future_measurements_weights = [0.5, 0.5, 1.0]
self.use_accumulated_reward_as_measurement = False
self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
self.scale_measurements_targets = {}
self.num_consecutive_playing_steps = EnvironmentSteps(8)
class DFPAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DFPAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=DFPMemoryParameters(),
networks={"main": DFPNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.dfp_agent:DFPAgent'
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_goal = self.ap.algorithm.goal_vector
self.target_measurements_scale_factors = None
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
network_inputs = batch.states(network_keys)
network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
self.ap.network_wrappers['main'].batch_size, axis=0)
# get the current outputs of the network
targets = self.networks['main'].online_network.predict(network_inputs)
# change the targets for the taken actions
for i in range(self.ap.network_wrappers['main'].batch_size):
targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()
result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if self.exploration_policy.requires_action_values():
# predict the future measurements
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
action_values = np.zeros(len(self.spaces.action.actions))
num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)
# calculate the score of each action by multiplying its predicted future measurements with the goal vector
for action_idx in range(len(self.spaces.action.actions)):
action_measurements = measurements_future_prediction[action_idx]
action_measurements = np.reshape(action_measurements,
(self.ap.algorithm.num_predicted_steps_ahead,
self.spaces.state['measurements'].shape[0]))
future_steps_values = np.dot(action_measurements, self.current_goal)
action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
self.ap.algorithm.future_measurements_weights)
else:
action_values = None
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(action_values)
if action_values is not None:
action_values = action_values.squeeze()
action_info = ActionInfo(action=action, action_value=action_values[action])
else:
action_info = ActionInfo(action=action)
return action_info
def set_environment_parameters(self, spaces: SpacesDefinition):
self.spaces = copy.deepcopy(spaces)
self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
measurements_names=
self.spaces.state['measurements'].measurements_names)
# if the user has provided some scale values, check that the given names match the measurement names
if set(self.spaces.state['measurements'].measurements_names).intersection(
self.ap.algorithm.scale_measurements_targets.keys()) !=\
set(self.ap.algorithm.scale_measurements_targets.keys()):
raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
"the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
self.spaces.state['measurements'].measurements_names))
super().set_environment_parameters(self.spaces)
# the below is done after calling the base class method, as it might add accumulated reward as a measurement
# fill out the missing measurements scale factors
for measurement_name in self.spaces.state['measurements'].measurements_names:
if measurement_name not in self.ap.algorithm.scale_measurements_targets:
self.ap.algorithm.scale_measurements_targets[measurement_name] = 1
self.target_measurements_scale_factors = \
np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
self.spaces.state['measurements'].measurements_names])
def handle_episode_ended(self):
last_episode = self.current_episode_buffer
if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
self._update_measurements_targets(last_episode,
self.ap.algorithm.num_predicted_steps_ahead)
super().handle_episode_ended()
def _update_measurements_targets(self, episode, num_steps):
if 'measurements' not in episode.transitions[0].state or episode.transitions[0].state['measurements'] == []:
raise ValueError("Measurements are not present in the transitions of the last episode played. ")
measurements_size = self.spaces.state['measurements'].shape[0]
for transition_idx, transition in enumerate(episode.transitions):
transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
for step in range(num_steps):
offset_idx = transition_idx + 2 ** step
if offset_idx >= episode.length():
if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
# the special MSE loss will ignore those entries so that the gradient will be 0 for these
transition.info['future_measurements'][step] = np.nan
continue
elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
offset_idx = - 1
transition.info['future_measurements'][step] = \
self.target_measurements_scale_factors * \
(episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])
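
The offsets used in _update_measurements_targets above are exponentially spaced, so each transition predicts the change in measurements at 1, 2, 4, ... steps into the future. For example, with the default num_predicted_steps_ahead = 6:

offsets = [2 ** step for step in range(6)]  # [1, 2, 4, 8, 16, 32] steps ahead of the current transition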

View File

@@ -0,0 +1,99 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters, MiddlewareScheme
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.core_types import EnvironmentSteps
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
class DQNAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.discount = 0.99
class DQNNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium)
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = True
self.create_target_network = True
class DQNAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=DQNAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=ExperienceReplayParameters(),
networks={"main": DQNNetworkParameters()})
self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000)
self.exploration.evaluation_epsilon = 0.05
@property
def path(self):
return 'rl_coach.agents.dqn_agent:DQNAgent'
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the action we actually took, the error is:
# TD error = r + discount*max(q_st_plus_1) - q_st
# for all other actions, the error is 0
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# only update the action that we have actually done in this transition
TD_errors = []
for i in range(self.ap.network_wrappers['main'].batch_size):
new_target = batch.rewards()[i] +\
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
TD_errors.append(np.abs(new_target - TD_targets[i, batch.actions()[i]]))
TD_targets[i, batch.actions()[i]] = new_target
# update errors in prioritized replay buffer
importance_weights = self.update_transition_priorities_and_get_weights(TD_errors, batch)
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
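
A worked example of the masking trick above: the TD targets are initialized with the online network's own predictions, so only the entry of the action that was actually taken produces a non-zero error (the values below are made up for illustration):

import numpy as np

q_current = np.array([1.0, 2.0, 3.0])      # online prediction for one state with 3 actions
td_target = q_current.copy()               # start from the current prediction
action_taken, reward, discount = 1, 0.5, 0.99
max_q_next = 2.5                           # max over the target network's next-state Q values
td_target[action_taken] = reward + discount * max_q_next   # 2.975
# resulting error vector for the loss: [0.0, 0.975, 0.0]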

View File

@@ -0,0 +1,108 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
import copy
from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters
from rl_coach.core_types import RunPhase
from rl_coach.spaces import SpacesDefinition
class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):
def __init__(self):
super().__init__()
self.time_limit = 40
self.sub_goal_testing_rate = 0.5
class HACDDPGAgentParameters(DDPGAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = HACDDPGAlgorithmParameters()
@property
def path(self):
return 'rl_coach.agents.hac_ddpg_agent:HACDDPGAgent'
# Hierarchical Actor Critic Generating Subgoals DDPG Agent - https://arxiv.org/pdf/1712.00948.pdf
class HACDDPGAgent(DDPGAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate
self.graph_manager = None
def choose_action(self, curr_state):
# the top level agent decides, for each of its generated sub-goals, whether all the layers beneath it are in a
# sub-goal testing phase
graph_manager = self.parent_level_manager.parent_graph_manager
if self.ap.is_a_highest_level_agent:
graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate
if self.phase == RunPhase.TRAIN:
if graph_manager.should_test_current_sub_goal:
self.exploration_policy.change_phase(RunPhase.TEST)
else:
self.exploration_policy.change_phase(self.phase)
action_info = super().choose_action(curr_state)
return action_info
def update_transition_before_adding_to_replay_buffer(self, transition):
graph_manager = self.parent_level_manager.parent_graph_manager
# deal with goals given from a higher level agent
if not self.ap.is_a_highest_level_agent:
transition.state['desired_goal'] = self.current_hrl_goal
transition.next_state['desired_goal'] = self.current_hrl_goal
# TODO: allow setting goals which are not part of the state. e.g. state-embedding using get_prediction
self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(
self.current_hrl_goal, transition.next_state))
goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
self.current_hrl_goal, transition.next_state)
transition.reward = goal_reward
transition.game_over = transition.game_over or sub_goal_reached
# each level tests its own generated sub goals
if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:
#TODO-fixme
# _, sub_goal_reached = self.parent_level_manager.environment.agents['agent_1'].spaces.goal.\
# get_reward_for_goal_and_state(transition.action, transition.next_state)
_, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(
transition.action, transition.next_state)
sub_goal_is_missed = not sub_goal_reached
if sub_goal_is_missed:
transition.reward = -self.ap.algorithm.time_limit
return transition
def set_environment_parameters(self, spaces: SpacesDefinition):
super().set_environment_parameters(spaces)
if self.ap.is_a_highest_level_agent:
# the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have
# their GoalsSpace set to the in_action_space in agent.set_environment_parameters()
self.spaces.goal = self.spaces.action
self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])
if not self.ap.is_a_highest_level_agent:
self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward

View File

@@ -0,0 +1,115 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from collections import OrderedDict
from typing import Union
import pygame
from rl_coach.agents.agent import Agent
from rl_coach.agents.bc_agent import BCNetworkParameters
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, InputEmbedderParameters, EmbedderScheme, \
AgentParameters
from rl_coach.core_types import ActionInfo
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from pandas import to_pickle
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class HumanAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
class HumanNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
class HumanAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=HumanAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": BCNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.human_agent:HumanAgent'
class HumanAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.clock = pygame.time.Clock()
self.max_fps = int(self.ap.visualization.max_fps_for_human_control)
self.env = None
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
self.env = self.parent_level_manager._real_environment
screen.log_title("Human Control Mode")
available_keys = self.env.get_available_keys()
if available_keys:
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
screen.log("")
for action, key in available_keys:
screen.log("\t- {}: {}".format(action, key))
screen.separator()
def train(self):
return 0
def choose_action(self, curr_state):
action = ActionInfo(self.env.get_action_from_user(), action_value=0)
action = self.output_filter.reverse_filter(action)
# keep constant fps
self.clock.tick(self.max_fps)
if not self.env.renderer.is_open:
self.save_replay_buffer_and_exit()
return action
def save_replay_buffer_and_exit(self):
replay_buffer_path = os.path.join(self.agent_logger.experiments_path, 'replay_buffer.p')
self.memory.tp = None
to_pickle(self.memory, replay_buffer_path)
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
exit()
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix="Recording")

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from typing import Union
from rl_coach.core_types import RunPhase, ActionInfo
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
## This is an abstract agent - there is no learn_from_batch method ##
# Imitation Agent
class ImitationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.imitation = True
def extract_action_values(self, prediction):
return prediction.squeeze()
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
prediction = self.networks['main'].online_network.predict(self.prepare_batch_for_inference(curr_state, 'main'))
# get action values and extract the best action from it
action_values = self.extract_action_values(prediction)
if type(self.spaces.action) == DiscreteActionSpace:
# DISCRETE
self.exploration_policy.phase = RunPhase.TEST
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action,
action_probability=action_values[action])
else:
# CONTINUOUS
action = action_values
action_info = ActionInfo(action=action)
return action_info
def log_to_screen(self):
# log to screen
if self.phase == RunPhase.TRAIN:
# for the training phase - we log during the episode to visualize the progress in training
log = OrderedDict()
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Loss"] = self.loss.values[-1]
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix="Training")
else:
# for the evaluation phase - logging as in regular RL
super().log_to_screen()
def learn_from_batch(self, batch):
raise NotImplementedError("ImitationAgent is an abstract agent. Not to be used directly.")

View File

@@ -0,0 +1,72 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
class MixedMonteCarloAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.monte_carlo_mixing_rate = 0.1
class MixedMonteCarloAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = MixedMonteCarloAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.mmc_agent:MixedMonteCarloAgent'
class MixedMonteCarloAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# for the 1-step target we use the double-DQN formulation, so actions are selected greedily by the online network
selected_actions = np.argmax(self.networks['main'].online_network.predict(batch.next_states(network_keys)), 1)
# TD_targets are initialized with the current prediction so that we will
# only update the action that we have actually done in this transition
q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
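# mix the 1-step (double DQN) target with the full Monte Carlo return, per transition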
for i in range(self.ap.network_wrappers['main'].batch_size):
one_step_target = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1[i][selected_actions[i]]
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.mixing_rate) * one_step_target + \
self.mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.q_head import QHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
InputEmbedderParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.utils import last_sample
from rl_coach.core_types import EnvironmentSteps
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class NStepQNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [QHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.async_training = True
self.shared_optimizer = True
self.create_target_network = True
class NStepQAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000)
self.apply_gradients_every_x_episodes = 1
self.num_steps_between_gradient_updates = 5 # this is called t_max in all the papers
self.targets_horizon = 'N-Step'
class NStepQAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NStepQAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": NStepQNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.n_step_q_agent:NStepQAgent'
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.last_gradient_update_step_idx = 0
self.q_values = self.register_signal('Q Values')
self.value_loss = self.register_signal('Value Loss')
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the values for the current states
state_value_head_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# the targets for the state value estimator
if self.ap.algorithm.targets_horizon == '1-Step':
# 1-Step Q learning
q_st_plus_1 = self.networks['main'].target_network.predict(batch.next_states(network_keys))
for i in reversed(range(batch.size)):
state_value_head_targets[i][batch.actions()[i]] = \
batch.rewards()[i] \
+ (1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * np.max(q_st_plus_1[i], 0)
elif self.ap.algorithm.targets_horizon == 'N-Step':
# N-Step Q learning
if batch.game_overs()[-1]:
R = 0
else:
R = np.max(self.networks['main'].target_network.predict(last_sample(batch.next_states(network_keys))))
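# accumulate the discounted n-step return backwards over the collected transitions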
for i in reversed(range(batch.size)):
R = batch.rewards()[i] + self.ap.algorithm.discount * R
state_value_head_targets[i][batch.actions()[i]] = R
else:
raise ValueError('The available values for targets_horizon are: 1-Step, N-Step')
# train
result = self.networks['main'].online_network.accumulate_gradients(batch.states(network_keys), [state_value_head_targets])
# logging
total_loss, losses, unclipped_grads = result[:3]
self.value_loss.add_sample(losses[0])
return total_loss, losses, unclipped_grads
def train(self):
# update the target network of every network that has a target network
if any([network.has_target for network in self.networks.values()]) \
and self._should_update_online_weights_to_target():
for network in self.networks.values():
network.update_target_network(self.ap.algorithm.rate_for_copying_weights_to_target)
self.agent_logger.create_signal_value('Update Target Network', 1)
else:
self.agent_logger.create_signal_value('Update Target Network', 0, overwrite=False)
return PolicyOptimizationAgent.train(self)

View File

@@ -0,0 +1,126 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.naf_head import NAFHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, \
NetworkParameters, InputEmbedderParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.core_types import ActionInfo, EnvironmentSteps
from rl_coach.exploration_policies.ou_process import OUProcessParameters
class NAFNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [NAFHeadParameters()]
self.loss_weights = [1.0]
self.optimizer_type = 'Adam'
self.learning_rate = 0.001
self.async_training = True
self.create_target_network = True
class NAFAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.num_consecutive_training_steps = 5
self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1)
self.rate_for_copying_weights_to_target = 0.001
class NAFAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NAFAlgorithmParameters(),
exploration=OUProcessParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"main": NAFNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.naf_agent:NAFAgent'
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.l_values = self.register_signal("L")
self.a_values = self.register_signal("Advantage")
self.mu_values = self.register_signal("Action")
self.v_values = self.register_signal("V")
self.TD_targets = self.register_signal("TD targets")
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# TD error = r + discount*v_st_plus_1 - q_st
v_st_plus_1 = self.networks['main'].target_network.predict(
batch.next_states(network_keys),
self.networks['main'].target_network.output_heads[0].V,
squeeze_output=False,
)
TD_targets = np.expand_dims(batch.rewards(), -1) + \
(1.0 - np.expand_dims(batch.game_overs(), -1)) * self.ap.algorithm.discount * v_st_plus_1
self.TD_targets.add_sample(TD_targets)
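# the taken actions are fed as an extra network input ('output_0_0') so the NAF head can compose Q(s, a) = V(s) + A(s, a)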
result = self.networks['main'].train_and_sync_networks({**batch.states(network_keys),
'output_0_0': batch.actions(len(batch.actions().shape) == 1)
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads
def choose_action(self, curr_state):
if type(self.spaces.action) != BoxActionSpace:
raise ValueError('NAF works only for continuous control problems')
# convert to batch so we can run it through the network
tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
naf_head = self.networks['main'].online_network.output_heads[0]
action_values = self.networks['main'].online_network.predict(tf_input_state, outputs=naf_head.mu,
squeeze_output=False)
# get the actual action to use
action = self.exploration_policy.get_action(action_values)
# get the internal values for logging
outputs = [naf_head.mu, naf_head.Q, naf_head.L, naf_head.A, naf_head.V]
result = self.networks['main'].online_network.predict(
{**tf_input_state, 'output_0_0': action_values},
outputs=outputs
)
mu, Q, L, A, V = result
# store the q values statistics for logging
self.q_values.add_sample(Q)
self.l_values.add_sample(L)
self.a_values.add_sample(A)
self.mu_values.add_sample(mu)
self.v_values.add_sample(V)
action_info = ActionInfo(action=action, action_value=Q)
return action_info

View File

@@ -0,0 +1,176 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import pickle
from typing import Union
import numpy as np
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.architectures.tensorflow_components.heads.dnd_q_head import DNDQHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, AgentParameters, \
InputEmbedderParameters
from rl_coach.core_types import RunPhase, EnvironmentSteps, Episode, StateType
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters, MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.logger import screen
class NECNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [DNDQHeadParameters()]
self.loss_weights = [1.0]
self.rescale_gradient_from_head_by_factor = [1]
self.optimizer_type = 'Adam'
class NECAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.dnd_size = 500000
self.l2_norm_added_delta = 0.001
self.new_value_shift_coefficient = 0.1
self.number_of_knn = 50
self.DND_key_error_threshold = 0
self.num_consecutive_playing_steps = EnvironmentSteps(4)
self.propagate_updates_to_DND = False
self.n_step = 100
self.bootstrap_total_return_from_old_policy = True
class NECMemoryParameters(EpisodicExperienceReplayParameters):
def __init__(self):
super().__init__()
self.max_size = (MemoryGranularity.Transitions, 100000)
class NECAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=NECAlgorithmParameters(),
exploration=EGreedyParameters(),
memory=NECMemoryParameters(),
networks={"main": NECNetworkParameters()})
self.exploration.epsilon_schedule = ConstantSchedule(0.1)
self.exploration.evaluation_epsilon = 0.01
@property
def path(self):
return 'rl_coach.agents.nec_agent:NECAgent'
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.current_episode_state_embeddings = []
self.training_started = False
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def learn_from_batch(self, batch):
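# the DND must hold at least number_of_knn entries before targets can be computed and training can start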
if not self.networks['main'].online_network.output_heads[0].DND.has_enough_entries(self.ap.algorithm.number_of_knn):
return 0, [], 0
else:
if not self.training_started:
self.training_started = True
screen.log_title("Finished collecting initial entries in DND. Starting to train network...")
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
TD_targets = self.networks['main'].online_network.predict(batch.states(network_keys))
# only update the action that we have actually done in this transition
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.total_returns()[i]
# set the gradients to fetch for the DND update
fetches = []
head = self.networks['main'].online_network.output_heads[0]
if self.ap.algorithm.propagate_updates_to_DND:
fetches = [head.dnd_embeddings_grad, head.dnd_values_grad, head.dnd_indices]
# train the neural network
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets, fetches)
total_loss, losses, unclipped_grads = result[:3]
# update the DND keys and values using the extracted gradients
if self.ap.algorithm.propagate_updates_to_DND:
embedding_gradients = np.swapaxes(result[-1][0], 0, 1)
value_gradients = np.swapaxes(result[-1][1], 0, 1)
indices = np.swapaxes(result[-1][2], 0, 1)
head.DND.update_keys_and_values(batch.actions(), embedding_gradients, value_gradients, indices)
return total_loss, losses, unclipped_grads
def act(self):
if self.phase == RunPhase.HEATUP:
# get embedding in heatup (otherwise we get it through get_prediction)
embedding = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(self.curr_state, 'main'),
outputs=self.networks['main'].online_network.state_embedding)
self.current_episode_state_embeddings.append(embedding)
return super().act()
def get_all_q_values_for_states(self, states: StateType):
# we need to store the state embeddings regardless of whether the action is random or not
return self.get_prediction(states)
def get_prediction(self, states):
# get the actions q values and the state embedding
embedding, actions_q_values = self.networks['main'].online_network.predict(
self.prepare_batch_for_inference(states, 'main'),
outputs=[self.networks['main'].online_network.state_embedding,
self.networks['main'].online_network.output_heads[0].output]
)
if self.phase != RunPhase.TEST:
# store the state embedding for inserting it to the DND later
self.current_episode_state_embeddings.append(embedding.squeeze())
actions_q_values = actions_q_values[0][0]
return actions_q_values
def reset_internal_state(self):
super().reset_internal_state()
self.current_episode_state_embeddings = []
self.current_episode_buffer = \
Episode(discount=self.ap.algorithm.discount,
n_step=self.ap.algorithm.n_step,
bootstrap_total_return_from_old_policy=self.ap.algorithm.bootstrap_total_return_from_old_policy)
def handle_episode_ended(self):
super().handle_episode_ended()
# get the last full episode that we have collected
episode = self.call_memory('get_last_complete_episode')
if episode is not None and self.phase != RunPhase.TEST:
assert len(self.current_episode_state_embeddings) == episode.length()
returns = episode.get_transitions_attribute('total_return')
actions = episode.get_transitions_attribute('action')
self.networks['main'].online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
actions, returns)
def save_checkpoint(self, checkpoint_id):
with open(os.path.join(self.ap.task_parameters.save_checkpoint_dir, str(checkpoint_id) + '.dnd'), 'wb') as f:
pickle.dump(self.networks['main'].online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,94 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay, \
EpisodicExperienceReplayParameters
class PALAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.pal_alpha = 0.9
self.persistent_advantage_learning = False
self.monte_carlo_mixing_rate = 0.1
class PALAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = PALAlgorithmParameters()
self.memory = EpisodicExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.pal_agent:PALAgent'
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.alpha = agent_parameters.algorithm.pal_alpha
self.persistent = agent_parameters.algorithm.persistent_advantage_learning
self.monte_carlo_mixing_rate = agent_parameters.algorithm.monte_carlo_mixing_rate
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# next state values
q_st_plus_1_target, q_st_plus_1_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.next_states(network_keys))
])
selected_actions = np.argmax(q_st_plus_1_online, 1)
v_st_plus_1_target = np.max(q_st_plus_1_target, 1)
# current state values
q_st_target, q_st_online = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
v_st_target = np.max(q_st_target, 1)
# calculate TD error
TD_targets = np.copy(q_st_online)
for i in range(self.ap.network_wrappers['main'].batch_size):
TD_targets[i, batch.actions()[i]] = batch.rewards()[i] + \
(1.0 - batch.game_overs()[i]) * self.ap.algorithm.discount * \
q_st_plus_1_target[i][selected_actions[i]]
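# action gaps: V(s) - Q(s, a) for the current state, and V(s') - Q(s', a*) for the next state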
advantage_learning_update = v_st_target[i] - q_st_target[i, batch.actions()[i]]
next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
# Persistent Advantage Learning or Regular Advantage Learning
if self.persistent:
TD_targets[i, batch.actions()[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
else:
TD_targets[i, batch.actions()[i]] -= self.alpha * advantage_learning_update
# mixing monte carlo updates
monte_carlo_target = batch.total_returns()[i]
TD_targets[i, batch.actions()[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, batch.actions()[i]] \
+ self.monte_carlo_mixing_rate * monte_carlo_target
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,105 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.policy_optimization_agent import PolicyOptimizationAgent, PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.policy_head import PolicyHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import NetworkParameters, AlgorithmParameters, \
AgentParameters, InputEmbedderParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.logger import screen
from rl_coach.memories.episodic.single_episode_buffer import SingleEpisodeBufferParameters
class PolicyGradientNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters()}
self.middleware_parameters = FCMiddlewareParameters()
self.heads_parameters = [PolicyHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
class PolicyGradientAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP
self.apply_gradients_every_x_episodes = 5
self.beta_entropy = 0
self.num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers
class PolicyGradientsAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PolicyGradientAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=SingleEpisodeBufferParameters(),
networks={"main": PolicyGradientNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.policy_gradients_agent:PolicyGradientsAgent'
class PolicyGradientsAgent(PolicyOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.returns_mean = self.register_signal('Returns Mean')
self.returns_variance = self.register_signal('Returns Variance')
self.last_gradient_update_step_idx = 0
def learn_from_batch(self, batch):
# batch contains a list of episodes to learn from
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
total_returns = batch.total_returns()
for i in reversed(range(batch.size)):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
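# subtract a per-timestep baseline: the running mean of the returns observed at this timestep across episodes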
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
targets = total_returns
actions = batch.actions()
if type(self.spaces.action) != DiscreteActionSpace and len(actions.shape) < 2:
actions = np.expand_dims(actions, -1)
self.returns_mean.add_sample(np.mean(total_returns))
self.returns_variance.add_sample(np.std(total_returns))
result = self.networks['main'].online_network.accumulate_gradients(
{**batch.states(network_keys), 'output_0_0': actions}, targets
)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,166 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from collections import OrderedDict
from enum import Enum
from typing import Union
import numpy as np
from rl_coach.core_types import Batch, ActionInfo
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import eps
from rl_coach.agents.agent import Agent
from rl_coach.logger import screen
class PolicyGradientRescaler(Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined
Q_VALUE = 4
A_VALUE = 5
TD_RESIDUAL = 6
DISCOUNTED_TD_RESIDUAL = 7
GAE = 8
## This is an abstract agent - learn_from_batch is not implemented ##
class PolicyOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.policy_gradient_rescaler = None
if hasattr(self.ap.algorithm, 'policy_gradient_rescaler'):
self.policy_gradient_rescaler = self.ap.algorithm.policy_gradient_rescaler
# statistics for variance reduction
self.last_gradient_update_step_idx = 0
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = self.register_signal('Entropy')
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Name"] = self.full_name_id
if self.task_id is not None:
log["Worker"] = self.task_id
log["Episode"] = self.current_episode
log["Total reward"] = round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
log["Training iteration"] = self.training_iteration
screen.log_dict(log, prefix=self.phase.value)
def update_episode_statistics(self, episode):
episode_discounted_returns = []
for i in range(episode.length()):
transition = episode.get_transition(i)
episode_discounted_returns.append(transition.total_return)
self.num_episodes_where_step_has_been_seen[i] += 1
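# incremental running mean: m <- ((n - 1) * m + x) / n, where n already counts the current episode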
self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_return_over_multiple_episodes[i] += transition.total_return / \
self.num_episodes_where_step_has_been_seen[i]
self.mean_discounted_return = np.mean(episode_discounted_returns)
self.std_discounted_return = np.std(episode_discounted_returns)
def get_current_episode(self):
# most of the time the episode comes from the current episode buffer; only after the last transition is it taken
# from the "memory" (where it was stored at the end of the episode)
return self.memory.get_episode(0) or self.current_episode_buffer
def train(self):
episode = self.get_current_episode()
# check if we should calculate gradients or skip
episode_ended = episode.is_complete
num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
is_t_max_steps_passed = num_steps_passed_since_last_update >= self.ap.algorithm.num_steps_between_gradient_updates
if not (is_t_max_steps_passed or episode_ended):
return 0
total_loss = 0
if num_steps_passed_since_last_update > 0:
# we need to update the returns of the episode until now
episode.update_returns()
# get t_max transitions, or fewer if we reached a terminal state.
# will be used for both actor-critic and vanilla PG.
# In order to get full episodes, Vanilla PG will set the end_idx to a very big value.
transitions = []
start_idx = self.last_gradient_update_step_idx
end_idx = episode.length()
for idx in range(start_idx, end_idx):
transitions.append(episode.get_transition(idx))
self.last_gradient_update_step_idx = end_idx
# update the statistics for the variance reduction techniques
if self.policy_gradient_rescaler in \
[PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE,
PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP]:
self.update_episode_statistics(episode)
# accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
batch = Batch(transitions)
total_loss, losses, unclipped_grads = self.learn_from_batch(batch)
if self.current_episode % self.ap.algorithm.apply_gradients_every_x_episodes == 0:
for network in self.networks.values():
network.apply_gradients_and_sync_networks()
self.training_iteration += 1
# move the pointer to the next episode start and discard the episode.
if episode_ended:
# we need to remove the episode, because the next training iteration will be called before storing any
# additional transitions in the memory (we don't store a transition for the first call to observe), so the
# length of the memory won't be enforced and the old episode won't be removed
self.call_memory('remove_episode', 0)
self.last_gradient_update_step_idx = 0
return total_loss
def learn_from_batch(self, batch):
raise NotImplementedError("PolicyOptimizationAgent is an abstract agent. Not to be used directly.")
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "main")
return self.networks['main'].online_network.predict(tf_input_state)
def choose_action(self, curr_state):
# convert to batch so we can run it through the network
action_values = self.get_prediction(curr_state)
if isinstance(self.spaces.action, DiscreteActionSpace):
# DISCRETE
action_probabilities = np.array(action_values).squeeze()
action = self.exploration_policy.get_action(action_probabilities)
action_info = ActionInfo(action=action,
action_probability=action_probabilities[action])
self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps)))
elif isinstance(self.spaces.action, BoxActionSpace):
# CONTINUOUS
action = self.exploration_policy.get_action(action_values)
action_info = ActionInfo(action=action)
else:
raise ValueError("The action space of the environment is not compatible with the algorithm")
return action_info

View File

@@ -0,0 +1,338 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from collections import OrderedDict
from typing import Union
import numpy as np
from rl_coach.agents.actor_critic_agent import ActorCriticAgent
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.architectures.tensorflow_components.heads.v_head import VHeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import AlgorithmParameters, NetworkParameters, \
AgentParameters, InputEmbedderParameters, DistributedTaskParameters
from rl_coach.core_types import EnvironmentSteps, Batch
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.utils import force_list
from rl_coach.architectures.tensorflow_components.heads.ppo_head import PPOHeadParameters
from rl_coach.logger import screen
class PPOCriticNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [VHeadParameters()]
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOActorNetworkParameters(NetworkParameters):
def __init__(self):
super().__init__()
self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='tanh')}
self.middleware_parameters = FCMiddlewareParameters(activation_function='tanh')
self.heads_parameters = [PPOHeadParameters()]
self.optimizer_type = 'Adam'
self.loss_weights = [1.0]
self.async_training = True
self.l2_regularization = 0
self.create_target_network = True
self.batch_size = 128
class PPOAlgorithmParameters(AlgorithmParameters):
def __init__(self):
super().__init__()
self.policy_gradient_rescaler = PolicyGradientRescaler.GAE
self.gae_lambda = 0.96
self.target_kl_divergence = 0.01
self.initial_kl_coefficient = 1.0
self.high_kl_penalty_coefficient = 1000
self.clip_likelihood_ratio_using_epsilon = None
self.value_targets_mix_fraction = 0.1
self.estimate_state_value_using_gae = True
self.step_until_collecting_full_episodes = True
self.use_kl_regularization = True
self.beta_entropy = 0.01
self.num_consecutive_playing_steps = EnvironmentSteps(5000)
class PPOAgentParameters(AgentParameters):
def __init__(self):
super().__init__(algorithm=PPOAlgorithmParameters(),
exploration=AdditiveNoiseParameters(),
memory=EpisodicExperienceReplayParameters(),
networks={"critic": PPOCriticNetworkParameters(), "actor": PPOActorNetworkParameters()})
@property
def path(self):
return 'rl_coach.agents.ppo_agent:PPOAgent'
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
# signals definition
self.value_loss = self.register_signal('Value Loss')
self.policy_loss = self.register_signal('Policy Loss')
self.kl_divergence = self.register_signal('KL Divergence')
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = self.register_signal('Grads (unclipped)')
def fill_advantages(self, batch):
batch = Batch(batch)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# current_states_with_timestep = self.concat_state_and_timestep(batch)
current_state_values = self.networks['critic'].online_network.predict(batch.states(network_keys)).squeeze()
# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
advantages = batch.total_returns() - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
# current_state_values[batch.game_overs()] = 0
for idx, game_over in enumerate(batch.game_overs()):
if game_over:
# get advantages for the rollout
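# the rollout ends in a terminal state, so the value of the state following it is bootstrapped with 0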
value_bootstrapping = np.zeros((1,))
rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)
rollout_advantages, _ = \
self.get_general_advantage_estimation_values(batch.rewards()[episode_start_idx:idx+1],
rollout_state_values)
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
# TODO: this will be problematic with a shared memory
for transition, advantage in zip(self.memory.transitions, advantages):
transition.info['advantage'] = advantage
self.action_advantages.add_sample(advantages)
def train_value_network(self, dataset, epochs):
loss = []
batch = Batch(dataset)
network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()
# * Found not to have any impact *
# add a timestep to the observation
# current_states_with_timestep = self.concat_state_and_timestep(dataset)
mix_fraction = self.ap.algorithm.value_targets_mix_fraction
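# note: the mix fraction is only applied when the critic is optimized with LBFGS (see the targets computation below)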
for j in range(epochs):
curr_batch_size = batch.size
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
curr_batch_size = self.ap.network_wrappers['critic'].batch_size
for i in range(batch.size // curr_batch_size):
# split to batches for first order optimization techniques
current_states_batch = {
k: v[i * curr_batch_size:(i + 1) * curr_batch_size]
for k, v in batch.states(network_keys).items()
}
total_return_batch = batch.total_returns(True)[i * curr_batch_size:(i + 1) * curr_batch_size]
old_policy_values = force_list(self.networks['critic'].target_network.predict(
current_states_batch).squeeze())
if self.networks['critic'].online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
else:
current_values = self.networks['critic'].online_network.predict(current_states_batch)
targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction
inputs = copy.copy(current_states_batch)
for input_index, input in enumerate(old_policy_values):
name = 'output_0_{}'.format(input_index)
if name in self.networks['critic'].online_network.inputs:
inputs[name] = input
value_loss = self.networks['critic'].online_network.accumulate_gradients(inputs, targets)
self.networks['critic'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['critic'].apply_gradients_to_global_network()
self.networks['critic'].online_network.reset_accumulated_gradients()
loss.append([value_loss[0]])
loss = np.mean(loss, 0)
return loss
def concat_state_and_timestep(self, dataset):
current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
for transition in dataset]
current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
return current_states_with_timestep
def train_policy_network(self, dataset, epochs):
loss = []
for j in range(epochs):
loss = {
'total_loss': [],
'policy_losses': [],
'unclipped_grads': [],
'fetch_result': []
}
#shuffle(dataset)
for i in range(len(dataset) // self.ap.network_wrappers['actor'].batch_size):
batch = Batch(dataset[i * self.ap.network_wrappers['actor'].batch_size:
(i + 1) * self.ap.network_wrappers['actor'].batch_size])
network_keys = self.ap.network_wrappers['actor'].input_embedders_parameters.keys()
advantages = batch.info('advantage')
actions = batch.actions()
if not isinstance(self.spaces.action, DiscreteActionSpace) and len(actions.shape) == 1:
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
old_policy = force_list(self.networks['actor'].target_network.predict(batch.states(network_keys)))
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.networks['actor'].online_network.output_heads[0].kl_divergence,
self.networks['actor'].online_network.output_heads[0].entropy]
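# the fetched KL divergence drives the adaptive KL penalty in update_kl_coefficient(); the entropy is only logged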
inputs = copy.copy(batch.states(network_keys))
inputs['output_0_0'] = actions
# old_policy_distribution needs to be represented as a list: for discrete controls it has just a mean,
# otherwise it has both a mean and a standard deviation
for input_index, input in enumerate(old_policy):
inputs['output_0_{}'.format(input_index + 1)] = input
total_loss, policy_losses, unclipped_grads, fetch_result =\
self.networks['actor'].online_network.accumulate_gradients(
inputs, [advantages], additional_fetches=fetches)
self.networks['actor'].apply_gradients_to_online_network()
if isinstance(self.ap.task_parameters, DistributedTaskParameters):
self.networks['actor'].apply_gradients_to_global_network()
self.networks['actor'].online_network.reset_accumulated_gradients()
loss['total_loss'].append(total_loss)
loss['policy_losses'].append(policy_losses)
loss['unclipped_grads'].append(unclipped_grads)
loss['fetch_result'].append(fetch_result)
self.unclipped_grads.add_sample(unclipped_grads)
for key in loss.keys():
loss[key] = np.mean(loss[key], 0)
if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
curr_learning_rate = self.networks['critic'].online_network.get_variable_value(self.ap.learning_rate)
self.curr_learning_rate.add_sample(curr_learning_rate)
else:
curr_learning_rate = self.ap.network_wrappers['critic'].learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
("training epoch", j),
("learning_rate", curr_learning_rate)
]),
prefix="Policy training"
)
self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
self.entropy.add_sample(loss['fetch_result'][1])
self.kl_divergence.add_sample(loss['fetch_result'][0])
return loss['total_loss']
def update_kl_coefficient(self):
# John Schulman's implementation takes the mean KL divergence only over the last epoch. This is a bit odd,
# but we follow it for now because it is known to work well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
# update kl coefficient
kl_target = self.ap.algorithm.target_kl_divergence
kl_coefficient = self.networks['actor'].online_network.get_variable_value(
self.networks['actor'].online_network.output_heads[0].kl_coefficient)
new_kl_coefficient = kl_coefficient
if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
# kl too high => increase regularization
new_kl_coefficient *= 1.5
elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
# kl too low => decrease regularization
new_kl_coefficient /= 1.5
# update the kl coefficient variable
if kl_coefficient != new_kl_coefficient:
self.networks['actor'].online_network.set_variable_value(
self.networks['actor'].online_network.output_heads[0].assign_kl_coefficient,
new_kl_coefficient,
self.networks['actor'].online_network.output_heads[0].kl_coefficient_ph)
screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
def post_training_commands(self):
if self.ap.algorithm.use_kl_regularization:
self.update_kl_coefficient()
# clean memory
self.call_memory('clean')
def train(self):
loss = 0
if self._should_train(wait_for_full_episode=True):
for training_step in range(self.ap.algorithm.num_consecutive_training_steps):
self.networks['actor'].sync()
self.networks['critic'].sync()
dataset = self.memory.transitions
self.fill_advantages(dataset)
# take only the requested number of steps
dataset = dataset[:self.ap.algorithm.num_consecutive_playing_steps.num_steps]
value_loss = self.train_value_network(dataset, 1)
policy_loss = self.train_policy_network(dataset, 10)
self.value_loss.add_sample(value_loss)
self.policy_loss.add_sample(policy_loss)
self.post_training_commands()
self.training_iteration += 1
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)
def get_prediction(self, states):
tf_input_state = self.prepare_batch_for_inference(states, "actor")
return self.networks['actor'].online_network.predict(tf_input_state)

View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.architectures.tensorflow_components.heads.quantile_regression_q_head import QuantileRegressionQHeadParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.agents.dqn_agent import DQNAgentParameters, DQNNetworkParameters, DQNAlgorithmParameters
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.core_types import StateType
class QuantileRegressionDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [QuantileRegressionQHeadParameters()]
self.learning_rate = 0.00005
self.optimizer_epsilon = 0.01 / 32
class QuantileRegressionDQNAlgorithmParameters(DQNAlgorithmParameters):
def __init__(self):
super().__init__()
self.atoms = 200
self.huber_loss_interval = 1 # called k in the paper
class QuantileRegressionDQNAgentParameters(DQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = QuantileRegressionDQNAlgorithmParameters()
self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
self.exploration.evaluation_epsilon = 0.001
@property
def path(self):
return 'rl_coach.agents.qr_dqn_agent:QuantileRegressionDQNAgent'
# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.quantile_probabilities = np.ones(self.ap.algorithm.atoms) / float(self.ap.algorithm.atoms)
def get_q_values(self, quantile_values):
return np.dot(quantile_values, self.quantile_probabilities)
# prediction's format is (batch,actions,atoms)
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
quantile_values = self.get_prediction(states)
actions_q_values = self.get_q_values(quantile_values)
else:
actions_q_values = None
return actions_q_values
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
# get the quantiles of the next states and current states
next_state_quantiles, current_quantiles = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# get the optimal actions to take for the next states
target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1)
# calculate the Bellman update
batch_idx = list(range(self.ap.network_wrappers['main'].batch_size))
TD_targets = batch.rewards(True) + (1.0 - batch.game_overs(True)) * self.ap.algorithm.discount \
* next_state_quantiles[batch_idx, target_actions]
# get the locations of the selected actions within the batch for indexing purposes
actions_locations = [[b, a] for b, a in zip(batch_idx, batch.actions())]
# calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order
cumulative_probabilities = np.array(range(self.ap.algorithm.atoms + 1)) / float(self.ap.algorithm.atoms) # tau_i
quantile_midpoints = 0.5*(cumulative_probabilities[1:] + cumulative_probabilities[:-1]) # tau^hat_i
quantile_midpoints = np.tile(quantile_midpoints, (self.ap.network_wrappers['main'].batch_size, 1))
sorted_quantiles = np.argsort(current_quantiles[batch_idx, batch.actions()])
for idx in range(self.ap.network_wrappers['main'].batch_size):
quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]]
# train
result = self.networks['main'].train_and_sync_networks({
**batch.states(network_keys),
'output_0_0': actions_locations,
'output_0_1': quantile_midpoints,
}, TD_targets)
total_loss, losses, unclipped_grads = result[:3]
return total_loss, losses, unclipped_grads

View File

@@ -0,0 +1,98 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplay
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.agents.agent import Agent
from rl_coach.core_types import ActionInfo, StateType
## This is an abstract agent - learn_from_batch is not implemented ##
class ValueOptimizationAgent(Agent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
self.q_values = self.register_signal("Q")
self.q_value_for_action = {}
def init_environment_dependent_modules(self):
super().init_environment_dependent_modules()
if isinstance(self.spaces.action, DiscreteActionSpace):
for i in range(len(self.spaces.action.actions)):
self.q_value_for_action[i] = self.register_signal("Q for action {}".format(i),
dump_one_value_per_episode=False,
dump_one_value_per_step=True)
# Algorithms whose Q values need to be derived from the raw network predictions (e.g. distributional agents) override this function
def get_all_q_values_for_states(self, states: StateType):
if self.exploration_policy.requires_action_values():
actions_q_values = self.get_prediction(states)
else:
actions_q_values = None
return actions_q_values
def get_prediction(self, states):
return self.networks['main'].online_network.predict(self.prepare_batch_for_inference(states, 'main'))
def update_transition_priorities_and_get_weights(self, TD_errors, batch):
# update errors in prioritized replay buffer
importance_weights = None
if isinstance(self.memory, PrioritizedExperienceReplay):
self.call_memory('update_priorities', (batch.info('idx'), TD_errors))
importance_weights = batch.info('weight')
return importance_weights
def _validate_action(self, policy, action):
if np.array(action).shape != ():
raise ValueError((
'The exploration_policy {} returned a vector of actions '
'instead of a single action. ValueOptimizationAgents '
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))
def choose_action(self, curr_state):
actions_q_values = self.get_all_q_values_for_states(curr_state)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
action = self.exploration_policy.get_action(actions_q_values)
self._validate_action(self.exploration_policy, action)
if actions_q_values is not None:
# this is for bootstrapped dqn
if type(actions_q_values) == list and len(actions_q_values) > 0:
actions_q_values = self.exploration_policy.last_action_values
actions_q_values = actions_q_values.squeeze()
# store the q values statistics for logging
self.q_values.add_sample(actions_q_values)
for i, q_value in enumerate(actions_q_values):
self.q_value_for_action[i].add_sample(q_value)
action_info = ActionInfo(action=action,
action_value=actions_q_values[action],
max_action_value=np.max(actions_q_values))
else:
action_info = ActionInfo(action=action)
return action_info
def learn_from_batch(self, batch):
raise NotImplementedError("ValueOptimizationAgent is an abstract agent. Not to be used directly.")

View File

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,71 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
class Architecture(object):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= ""):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces (observation, action, etc.) definition of the agent
:param name: the name of the network
"""
# spaces
self.spaces = spaces
self.name = name
self.network_wrapper_name = self.name.split('/')[0] # the name can be main/online and the network_wrapper_name will be main
self.full_name = "{}/{}".format(agent_parameters.full_name_id, name)
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
self.batch_size = self.network_parameters.batch_size
self.learning_rate = self.network_parameters.learning_rate
self.optimizer = None
self.ap = agent_parameters
def get_model(self):
pass
def predict(self, inputs):
pass
def train_on_batch(self, inputs, targets):
pass
def get_weights(self):
pass
def set_weights(self, weights, rate=1.0):
pass
def reset_accumulated_gradients(self):
pass
def accumulate_gradients(self, inputs, targets):
pass
def apply_and_reset_gradients(self, gradients):
pass
def apply_gradients(self, gradients):
pass
def get_variable_value(self, variable):
pass
def set_variable_value(self, assign_op, value, placeholder=None):
pass
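# Illustrative sketch of the soft-update semantics behind set_weights(weights, rate) above:
# new_weight = rate * source_weight + (1 - rate) * current_weight, with rate=1.0 copying
# the source exactly. A minimal numpy version, assuming the weights are given as a list of arrays:
import numpy as np

def soft_update(current_weights, source_weights, rate=1.0):
    # rate=1.0 copies the source exactly; a small rate yields a slow-moving target network
    return [rate * src + (1.0 - rate) * cur
            for cur, src in zip(current_weights, source_weights)]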

View File

@@ -0,0 +1,210 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Tuple
from rl_coach.base_parameters import Frameworks, AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.logger import failed_imports
try:
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork
except ImportError:
failed_imports.append("TensorFlow")
class NetworkWrapper(object):
"""
Contains multiple networks and manages syncing and gradient updates
between them.
"""
def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
spaces: SpacesDefinition, replicated_device=None, worker_device=None):
self.ap = agent_parameters
self.network_parameters = self.ap.network_wrappers[name]
self.has_target = has_target
self.has_global = has_global
self.name = name
self.sess = None
if self.network_parameters.framework == Frameworks.tensorflow:
general_network = GeneralTensorFlowNetwork
else:
raise Exception("{} Framework is not supported"
.format(Frameworks().to_string(self.network_parameters.framework)))
with tf.variable_scope("{}/{}".format(self.ap.full_name_id, name)):
# Global network - the main network shared between threads
self.global_network = None
if self.has_global:
# we assign the parameters of this network on the parameters server
with tf.device(replicated_device):
self.global_network = general_network(agent_parameters=agent_parameters,
name='{}/global'.format(name),
global_network=None,
network_is_local=False,
spaces=spaces,
network_is_trainable=True)
# Online network - local copy of the main network used for playing
self.online_network = None
with tf.device(worker_device):
self.online_network = general_network(agent_parameters=agent_parameters,
name='{}/online'.format(name),
global_network=self.global_network,
network_is_local=True,
spaces=spaces,
network_is_trainable=True)
# Target network - a local, slow updating network used for stabilizing the learning
self.target_network = None
if self.has_target:
with tf.device(worker_device):
self.target_network = general_network(agent_parameters=agent_parameters,
name='{}/target'.format(name),
global_network=self.global_network,
network_is_local=True,
spaces=spaces,
network_is_trainable=False)
def sync(self):
"""
Initializes the weights of the networks to match each other
:return:
"""
self.update_online_network()
self.update_target_network()
def update_target_network(self, rate=1.0):
"""
Copy weights: online network >>> target network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.target_network:
self.target_network.set_weights(self.online_network.get_weights(), rate)
def update_online_network(self, rate=1.0):
"""
Copy weights: global network >>> online network
:param rate: the rate of copying the weights - 1 for copying exactly
"""
if self.global_network:
self.online_network.set_weights(self.global_network.get_weights(), rate)
def apply_gradients_to_global_network(self, gradients=None):
"""
Apply gradients from the online network on the global network
:param gradients: optional gradients that will be used instead of the accumulated gradients
:return:
"""
if gradients is None:
gradients = self.online_network.accumulated_gradients
if self.network_parameters.shared_optimizer:
self.global_network.apply_gradients(gradients)
else:
self.online_network.apply_gradients(gradients)
def apply_gradients_to_online_network(self, gradients=None):
"""
Apply gradients from the online network on itself
:return:
"""
if gradients is None:
gradients = self.online_network.accumulated_gradients
self.online_network.apply_gradients(gradients)
def train_and_sync_networks(self, inputs, targets, additional_fetches=[], importance_weights=None):
"""
A generic training function that enables multi-threaded training using a global network if necessary.
:param inputs: The inputs for the network.
:param targets: The targets corresponding to the given inputs
:param additional_fetches: Any additional tensor the user wants to fetch
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:return: The loss of the training iteration
"""
result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
importance_weights=importance_weights, no_accumulation=True)
self.apply_gradients_and_sync_networks(reset_gradients=False)
return result
def apply_gradients_and_sync_networks(self, reset_gradients=True):
"""
Applies the gradients accumulated in the online network to the global network or to itself and syncs the
networks if necessary
:param reset_gradients: If set to True, the accumulated gradients will be reset to 0 after applying them to
the network. Setting it to False is useful when the accumulated gradients are overwritten
(instead of accumulated) by the accumulate_gradients function, and reduces the runtime
of this function by around 10%
"""
if self.global_network:
self.apply_gradients_to_global_network()
if reset_gradients:
self.online_network.reset_accumulated_gradients()
self.update_online_network()
else:
if reset_gradients:
self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients)
else:
self.online_network.apply_gradients(self.online_network.accumulated_gradients)
def parallel_prediction(self, network_input_tuples: List[Tuple]):
"""
Run several network predictions in parallel. Currently this only supports running each network once.
:param network_input_tuples: a list of tuples where the first element is the network (online_network,
target_network or global_network) and the second element is the inputs
:return: the outputs of all the networks in the same order as the inputs were given
"""
feed_dict = {}
fetches = []
for idx, (network, input) in enumerate(network_input_tuples):
feed_dict.update(network.create_feed_dict(input))
fetches += network.outputs
outputs = self.sess.run(fetches, feed_dict)
return outputs
def get_local_variables(self):
"""
Get all the variables that are local to the thread
:return: a list of all the variables that are local to the thread
"""
local_variables = [v for v in tf.local_variables() if self.online_network.name in v.name]
if self.has_target:
local_variables += [v for v in tf.local_variables() if self.target_network.name in v.name]
return local_variables
def get_global_variables(self):
"""
Get all the variables that are shared between threads
:return: a list of all the variables that are shared between threads
"""
global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name]
return global_variables
def set_session(self, sess):
self.sess = sess
self.online_network.set_session(sess)
if self.global_network:
self.global_network.set_session(sess)
if self.target_network:
self.target_network.set_session(sess)
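# Illustrative sketch of the call order an agent typically drives on a NetworkWrapper
# (constructing agent_parameters, spaces and the session is omitted; the names are placeholders):
# network = NetworkWrapper(agent_parameters, has_target=True, has_global=False,
#                          name='main', spaces=spaces)
# network.set_session(sess)
# network.sync()                                   # align the online/target/global copies
# loss = network.train_and_sync_networks(inputs, targets)
# network.update_target_network(rate=1.0)          # copy the online weights into the target network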

View File

@@ -0,0 +1,664 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import time
from typing import List
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, DistributedTaskParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import force_list, squeeze_list
from rl_coach.architectures.architecture import Architecture
from rl_coach.core_types import GradientClippingMethod
def batchnorm_activation_dropout(input_layer, batchnorm, activation_function, dropout, dropout_rate, layer_idx):
layers = [input_layer]
# batchnorm
if batchnorm:
layers.append(
tf.layers.batch_normalization(layers[-1], name="batchnorm{}".format(layer_idx))
)
# activation
if activation_function:
layers.append(
activation_function(layers[-1], name="activation{}".format(layer_idx))
)
# dropout
if dropout:
layers.append(
tf.layers.dropout(layers[-1], dropout_rate, name="dropout{}".format(layer_idx))
)
# remove the input layer from the layers list
del layers[0]
return layers
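# Illustrative usage sketch (the tensors and arguments below are placeholders): appending the
# batchnorm -> activation -> dropout chain after some previously built layer:
# layers.extend(batchnorm_activation_dropout(layers[-1], batchnorm=True,
#                                            activation_function=tf.nn.relu,
#                                            dropout=False, dropout_rate=0.0, layer_idx=0))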
class Conv2d(object):
def __init__(self, params: List):
"""
:param params: list of [num_filters, kernel_size, strides]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow conv2d layer
:param input_layer: previous layer
:param name: layer name
:return: conv2d layer
"""
return tf.layers.conv2d(input_layer, filters=self.params[0], kernel_size=self.params[1], strides=self.params[2],
data_format='channels_last', name=name)
class Dense(object):
def __init__(self, params: List):
"""
:param params: list of [num_output_neurons]
"""
self.params = params
def __call__(self, input_layer, name: str):
"""
returns a tensorflow dense layer
:param input_layer: previous layer
:param name: layer name
:return: dense layer
"""
return tf.layers.dense(input_layer, self.params[0], name=name)
def variable_summaries(var):
"""Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
with tf.name_scope('summaries'):
layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2]
with tf.name_scope(layer_weight_name):
mean = tf.reduce_mean(var)
tf.summary.scalar('mean', mean)
with tf.name_scope('stddev'):
stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
tf.summary.scalar('stddev', stddev)
tf.summary.scalar('max', tf.reduce_max(var))
tf.summary.scalar('min', tf.reduce_min(var))
tf.summary.histogram('histogram', var)
def local_getter(getter, name, *args, **kwargs):
"""
This is a wrapper around the tf.get_variable function which puts the variables in the local variables collection
instead of the global variables collection. The local variables collection will hold variables which are not shared
between workers. These variables are also assumed to be non-trainable (the optimizer does not apply gradients to
them), but we can still calculate gradients with respect to them and update their contents.
"""
kwargs['collections'] = [tf.GraphKeys.LOCAL_VARIABLES]
return getter(name, *args, **kwargs)
class TensorFlowArchitecture(Architecture):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= "",
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
:param network_is_trainable: is the network trainable (we can apply gradients on it)
"""
super().__init__(agent_parameters, spaces, name)
self.middleware = None
self.network_is_local = network_is_local
self.global_network = global_network
if not self.network_parameters.tensorflow_support:
raise ValueError('TensorFlow is not supported for this agent')
self.sess = None
self.inputs = {}
self.outputs = []
self.targets = []
self.importance_weights = []
self.losses = []
self.total_loss = None
self.trainable_weights = []
self.weights_placeholders = []
self.shared_accumulated_gradients = []
self.curr_rnn_c_in = None
self.curr_rnn_h_in = None
self.gradients_wrt_inputs = []
self.train_writer = None
self.accumulated_gradients = None
self.network_is_trainable = network_is_trainable
self.is_chief = self.ap.task_parameters.task_index == 0
self.network_is_global = not self.network_is_local and global_network is None
self.distributed_training = self.network_is_global or self.network_is_local and global_network is not None
self.optimizer_type = self.network_parameters.optimizer_type
if self.ap.task_parameters.seed is not None:
tf.set_random_seed(self.ap.task_parameters.seed)
with tf.variable_scope("/".join(self.name.split("/")[1:]), initializer=tf.contrib.layers.xavier_initializer(),
custom_getter=local_getter if network_is_local and global_network else None):
self.global_step = tf.train.get_or_create_global_step()
# build the network
self.get_model()
# model weights
self.weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.full_name)
# create the placeholder for the assigning gradients and some tensorboard summaries for the weights
for idx, var in enumerate(self.weights):
placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
self.weights_placeholders.append(placeholder)
if self.ap.visualization.tensorboard:
variable_summaries(var)
# create op for assigning a list of weights to the network weights
self.update_weights_from_list = [weights.assign(holder) for holder, weights in
zip(self.weights_placeholders, self.weights)]
# locks for synchronous training
if self.network_is_global:
self._create_locks_for_synchronous_training()
# gradients ops
self._create_gradient_ops()
# L2 regularization
if self.network_parameters.l2_regularization != 0:
self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.weights])
* self.network_parameters.l2_regularization]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization)
self.inc_step = self.global_step.assign_add(1)
# reset LSTM hidden cells
self.reset_internal_memory()
if self.ap.visualization.tensorboard:
current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
scope=tf.contrib.framework.get_name_scope())
self.merged = tf.summary.merge(current_scope_summaries)
# initialize or restore model
self.init_op = tf.group(
tf.global_variables_initializer(),
tf.local_variables_initializer()
)
# set the fetches for training
self._set_initial_fetch_list()
def _set_initial_fetch_list(self):
"""
Create an initial list of tensors to fetch in each training iteration
:return: None
"""
self.train_fetches = [self.gradients_norm]
if self.network_parameters.clip_gradients:
self.train_fetches.append(self.clipped_grads)
else:
self.train_fetches.append(self.tensor_gradients)
self.train_fetches += [self.total_loss, self.losses]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.train_fetches.append(self.middleware.state_out)
self.additional_fetches_start_idx = len(self.train_fetches)
def _create_locks_for_synchronous_training(self):
"""
Create locks for synchronizing the different workers during training
:return: None
"""
self.lock_counter = tf.get_variable("lock_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.lock = self.lock_counter.assign_add(1, use_locking=True)
self.lock_init = self.lock_counter.assign(0)
self.release_counter = tf.get_variable("release_counter", [], tf.int32,
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
self.release = self.release_counter.assign_add(1, use_locking=True)
self.release_decrement = self.release_counter.assign_add(-1, use_locking=True)
self.release_init = self.release_counter.assign(0)
def _create_gradient_ops(self):
"""
Create all the tensorflow operations for calculating gradients, processing the gradients and applying them
:return: None
"""
self.tensor_gradients = tf.gradients(self.total_loss, self.weights)
self.gradients_norm = tf.global_norm(self.tensor_gradients)
# gradient clipping
if self.network_parameters.clip_gradients is not None and self.network_parameters.clip_gradients != 0:
self._create_gradient_clipping_ops()
# when using a shared optimizer, we create accumulators to store gradients from all the workers before
# applying them
if self.distributed_training:
self._create_gradient_accumulators()
# gradients of the outputs w.r.t. the inputs
# at the moment, this is only used by ddpg
self.gradients_wrt_inputs = [{name: tf.gradients(output, input_ph) for name, input_ph in
self.inputs.items()} for output in self.outputs]
self.gradients_weights_ph = [tf.placeholder('float32', self.outputs[i].shape, 'output_gradient_weights')
for i in range(len(self.outputs))]
self.weighted_gradients = []
for i in range(len(self.outputs)):
unnormalized_gradients = tf.gradients(self.outputs[i], self.weights, self.gradients_weights_ph[i])
# unnormalized gradients seem to be better at this time. TODO: validate this across more environments
# self.weighted_gradients.append(list(map(lambda x: tf.div(x, self.network_parameters.batch_size),
# unnormalized_gradients)))
self.weighted_gradients.append(unnormalized_gradients)
# defining the optimization process (for LBFGS we have less control over the optimizer)
if self.optimizer_type != 'LBFGS' and self.network_is_trainable:
self._create_gradient_applying_ops()
def _create_gradient_accumulators(self):
if self.network_is_global:
self.shared_accumulated_gradients = [tf.Variable(initial_value=tf.zeros_like(var)) for var in self.weights]
self.accumulate_shared_gradients = [var.assign_add(holder, use_locking=True) for holder, var in
zip(self.weights_placeholders, self.shared_accumulated_gradients)]
self.init_shared_accumulated_gradients = [var.assign(tf.zeros_like(var)) for var in
self.shared_accumulated_gradients]
elif self.network_is_local:
self.accumulate_shared_gradients = self.global_network.accumulate_shared_gradients
self.init_shared_accumulated_gradients = self.global_network.init_shared_accumulated_gradients
def _create_gradient_clipping_ops(self):
"""
Create tensorflow ops for clipping the gradients according to the given GradientClippingMethod
:return: None
"""
if self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByGlobalNorm:
self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients,
self.network_parameters.clip_gradients)
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByValue:
self.clipped_grads = [tf.clip_by_value(grad,
-self.network_parameters.clip_gradients,
self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
elif self.network_parameters.gradients_clipping_method == GradientClippingMethod.ClipByNorm:
self.clipped_grads = [tf.clip_by_norm(grad, self.network_parameters.clip_gradients)
for grad in self.tensor_gradients]
def _create_gradient_applying_ops(self):
"""
Create tensorflow ops for applying the gradients to the network weights according to the training scheme
(distributed training - local or global network, shared optimizer, etc.)
:return: None
"""
if self.network_is_global and self.network_parameters.shared_optimizer and \
not self.network_parameters.async_training:
# synchronous training with shared optimizer? -> create an operation for applying the gradients
# accumulated in the shared gradients accumulator
self.update_weights_from_shared_gradients = self.optimizer.apply_gradients(
zip(self.shared_accumulated_gradients, self.weights),
global_step=self.global_step)
elif self.distributed_training and self.network_is_local:
# distributed training but independent optimizer? -> create an operation for applying the gradients
# to the global weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.global_network.weights), global_step=self.global_step)
elif self.network_is_trainable:
# not any of the above but is trainable? -> create an operation for applying the gradients to
# this network weights
self.update_weights_from_batch_gradients = self.optimizer.apply_gradients(
zip(self.weights_placeholders, self.weights), global_step=self.global_step)
def set_session(self, sess):
self.sess = sess
task_is_distributed = isinstance(self.ap.task_parameters, DistributedTaskParameters)
# initialize the session parameters in single threaded runs. Otherwise, this is done through the
# MonitoredSession object in the graph manager
if not task_is_distributed:
self.sess.run(self.init_op)
if self.ap.visualization.tensorboard:
# Write the merged summaries to the current experiment directory
if not task_is_distributed:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path + '/tensorboard')
self.train_writer.add_graph(self.sess.graph)
elif self.network_is_local:
self.train_writer = tf.summary.FileWriter(self.ap.task_parameters.experiment_path +
'/tensorboard/worker{}'.format(self.ap.task_parameters.task_index))
self.train_writer.add_graph(self.sess.graph)
# wait for all the workers to set their session
if not self.network_is_local:
self.wait_for_all_workers_barrier()
def reset_accumulated_gradients(self):
"""
Reset the gradients accumulation placeholder
"""
if self.accumulated_gradients is None:
self.accumulated_gradients = self.sess.run(self.weights)
for ix, grad in enumerate(self.accumulated_gradients):
self.accumulated_gradients[ix] = grad * 0
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
no_accumulation=False):
"""
Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
placeholders
:param additional_fetches: Optional tensors to fetch during gradients calculation
:param inputs: The input batch for the network
:param targets: The targets corresponding to the input batch
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:param no_accumulation: If set to True, the gradients in the accumulated gradients placeholder will be
replaced by the newly calculated gradients instead of being accumulated.
This can speed up the function runtime by around 10%.
:return: A list containing the total loss and the individual network heads losses
"""
if self.accumulated_gradients is None:
self.reset_accumulated_gradients()
# feed inputs
if additional_fetches is None:
additional_fetches = []
feed_dict = self.create_feed_dict(inputs)
# feed targets
targets = force_list(targets)
for placeholder_idx, target in enumerate(targets):
feed_dict[self.targets[placeholder_idx]] = target
# feed importance weights
importance_weights = force_list(importance_weights)
for placeholder_idx, target_ph in enumerate(targets):
if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
importance_weight = np.ones(target_ph.shape[0])
else:
importance_weight = importance_weights[placeholder_idx]
importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))
feed_dict[self.importance_weights[placeholder_idx]] = importance_weight
if self.optimizer_type != 'LBFGS':
# feed the lstm state if necessary
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
# we can't always assume that we are starting from scratch here can we?
feed_dict[self.middleware.c_in] = self.middleware.c_init
feed_dict[self.middleware.h_in] = self.middleware.h_init
fetches = self.train_fetches + additional_fetches
if self.ap.visualization.tensorboard:
fetches += [self.merged]
# get grads
result = self.sess.run(fetches, feed_dict=feed_dict)
if hasattr(self, 'train_writer') and self.train_writer is not None:
self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))
# extract the fetches
norm_unclipped_grads, grads, total_loss, losses = result[:4]
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
(self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
fetched_tensors = []
if len(additional_fetches) > 0:
fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
len(additional_fetches)]
# accumulate the gradients
for idx, grad in enumerate(grads):
if no_accumulation:
self.accumulated_gradients[idx] = grad
else:
self.accumulated_gradients[idx] += grad
return total_loss, losses, norm_unclipped_grads, fetched_tensors
else:
self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)
return [0]
def create_feed_dict(self, inputs):
feed_dict = {}
for input_name, input_value in inputs.items():
if isinstance(input_name, str):
if input_name not in self.inputs:
raise ValueError((
'input name {input_name} was provided to create a feed '
'dictionary, but there is no placeholder with that name. '
'placeholder names available include: {placeholder_names}'
).format(
input_name=input_name,
placeholder_names=', '.join(self.inputs.keys())
))
feed_dict[self.inputs[input_name]] = input_value
elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder':
feed_dict[input_name] = input_value
else:
raise ValueError((
'input dictionary expects strings or placeholders as keys, '
'but found key {key} of type {type}'
).format(
key=input_name,
type=type(input_name),
))
return feed_dict
def apply_and_reset_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights and resets the accumulation placeholder
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them
"""
self.apply_gradients(gradients, scaler)
self.reset_accumulated_gradients()
def wait_for_all_workers_to_lock(self, lock: str, include_only_training_workers: bool=False):
"""
Waits for all the workers to lock a certain lock and then continues
:param lock: the name of the lock to use
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
if include_only_training_workers:
num_workers_to_wait_for = self.ap.task_parameters.num_training_tasks
else:
num_workers_to_wait_for = self.ap.task_parameters.num_tasks
# lock
if hasattr(self, '{}_counter'.format(lock)):
self.sess.run(getattr(self, lock))
while self.sess.run(getattr(self, '{}_counter'.format(lock))) % num_workers_to_wait_for != 0:
time.sleep(0.00001)
# self.sess.run(getattr(self, '{}_init'.format(lock)))
else:
raise ValueError("no counter was defined for the lock {}".format(lock))
def wait_for_all_workers_barrier(self, include_only_training_workers: bool=False):
"""
A barrier that allows waiting for all the workers to finish a certain block of commands
:param include_only_training_workers: wait only for training workers or for all the workers?
:return: None
"""
self.wait_for_all_workers_to_lock('lock', include_only_training_workers=include_only_training_workers)
self.sess.run(self.lock_init)
# we need to lock again (on a different lock) in order to prevent a situation where one of the workers continues
# and manages to increase the lock again by one, only to have a late worker reset it again.
# so we want to make sure that all workers are done resetting the lock before continuing to reuse that lock.
self.wait_for_all_workers_to_lock('release', include_only_training_workers=include_only_training_workers)
self.sess.run(self.release_init)
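# Illustrative note (hedged): in synchronous distributed training each worker accumulates its
# gradients, adds them to the shared accumulators, and then calls
# wait_for_all_workers_barrier(include_only_training_workers=True) so that the chief applies
# the shared gradients exactly once per step before any worker moves on (see apply_gradients below).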
def apply_gradients(self, gradients, scaler=1.):
"""
Applies the given gradients to the network weights
:param gradients: The gradients to use for the update
:param scaler: A scaling factor that allows rescaling the gradients before applying them.
The gradients will be MULTIPLIED by this factor
"""
if self.network_parameters.async_training or not isinstance(self.ap.task_parameters, DistributedTaskParameters):
if hasattr(self, 'global_step') and not self.network_is_local:
self.sess.run(self.inc_step)
if self.optimizer_type != 'LBFGS':
if self.distributed_training and not self.network_parameters.async_training:
# rescale the gradients so that they average out with the gradients from the other workers
if self.network_parameters.scale_down_gradients_by_number_of_workers_for_sync_training:
scaler /= float(self.ap.task_parameters.num_training_tasks)
# rescale the gradients
if scaler != 1.:
for gradient in gradients:
gradient *= scaler
# apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
if self.distributed_training and self.network_parameters.shared_optimizer \
and not self.network_parameters.async_training:
# synchronous distributed training with shared optimizer:
# - each worker adds its gradients to the shared gradients accumulators
# - we wait for all the workers to add their gradients
# - the chief worker (worker with task index = 0) applies the gradients once and resets the accumulators
self.sess.run(self.accumulate_shared_gradients, feed_dict=feed_dict)
self.wait_for_all_workers_barrier(include_only_training_workers=True)
if self.is_chief:
self.sess.run(self.update_weights_from_shared_gradients)
self.sess.run(self.init_shared_accumulated_gradients)
else:
# async distributed training / distributed training with independent optimizer
# / non-distributed training - just apply the gradients
feed_dict = dict(zip(self.weights_placeholders, gradients))
self.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict)
# release barrier
if self.distributed_training and not self.network_parameters.async_training:
self.wait_for_all_workers_barrier(include_only_training_workers=True)
def predict(self, inputs, outputs=None, squeeze_output=True, initial_feed_dict=None):
"""
Run a forward pass of the network using the given input
:param inputs: The input for the network
:param outputs: The output for the network, defaults to self.outputs
:param squeeze_output: call squeeze_list on output
:param initial_feed_dict: a dictionary to use as the initial feed_dict. other inputs will be added to this dict
:return: The network output
WARNING: must only be called once per state, since each call is assumed by the LSTM to be a new time step.
"""
feed_dict = self.create_feed_dict(inputs)
if initial_feed_dict:
feed_dict.update(initial_feed_dict)
if outputs is None:
outputs = self.outputs
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
feed_dict[self.middleware.c_in] = self.curr_rnn_c_in
feed_dict[self.middleware.h_in] = self.curr_rnn_h_in
output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.sess.run([outputs, self.middleware.state_out],
feed_dict=feed_dict)
else:
output = self.sess.run(outputs, feed_dict)
if squeeze_output:
output = squeeze_list(output)
return output
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
"""
Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients
:param additional_fetches: Optional tensors to fetch during the training process
:param inputs: The input for the network
:param targets: The targets corresponding to the input batch
:param scaler: A scaling factor that allows rescaling the gradients before applying them
:param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
error of this sample. If it is not given, the sample losses won't be scaled
:return: The loss of the network
"""
if additional_fetches is None:
additional_fetches = []
additional_fetches = force_list(additional_fetches)
loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
importance_weights=importance_weights)
self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
return loss
def get_weights(self):
"""
:return: a list of tensors containing the network weights for each layer
"""
return self.weights
def set_weights(self, weights, new_rate=1.0):
"""
Sets the network weights from the given list of weight tensors
"""
feed_dict = {}
old_weights, new_weights = self.sess.run([self.get_weights(), weights])
for placeholder_idx, new_weight in enumerate(new_weights):
feed_dict[self.weights_placeholders[placeholder_idx]]\
= new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx]
self.sess.run(self.update_weights_from_list, feed_dict)
def get_variable_value(self, variable):
"""
Get the value of a variable from the graph
:param variable: the variable
:return: the value of the variable
"""
return self.sess.run(variable)
def set_variable_value(self, assign_op, value, placeholder=None):
"""
Updates the value of a variable.
This requires having an assign operation for the variable, and a placeholder which will provide the value
:param assign_op: an assign operation for the variable
:param value: a value to set the variable to
:param placeholder: a placeholder to hold the given value for injecting it into the variable
"""
self.sess.run(assign_op, feed_dict={placeholder: value})
def reset_internal_memory(self):
"""
Reset any internal memory used by the network. For example, an LSTM internal state
:return: None
"""
# initialize LSTM hidden states
if self.middleware.__class__.__name__ == 'LSTMMiddleware':
self.curr_rnn_c_in = self.middleware.c_init
self.curr_rnn_h_in = self.middleware.h_init
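# Illustrative sketch of the gradient cycle implemented by this class (object construction is
# omitted and the inputs/targets below are placeholders):
# total_loss, losses, grad_norm, fetched = net.accumulate_gradients(inputs, targets)
# net.apply_and_reset_gradients(net.accumulated_gradients)   # apply the buffer, then zero it
# or, equivalently, in a single call:
# loss = net.train_on_batch(inputs, targets)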

View File

@@ -0,0 +1,102 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Tuple
import tensorflow as tf
def create_cluster_spec(parameters_server: str, workers: str) -> tf.train.ClusterSpec:
"""
Creates a ClusterSpec object representing the cluster.
:param parameters_server: comma-separated list of hostname:port pairs to which the parameter servers are assigned
:param workers: comma-separated list of hostname:port pairs to which the workers are assigned
:return: a ClusterSpec object representing the cluster
"""
# extract the parameter servers and workers from the given strings
ps_hosts = parameters_server.split(",")
worker_hosts = workers.split(",")
# Create a cluster spec from the parameter server and worker hosts
cluster_spec = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
return cluster_spec
def create_and_start_parameters_server(cluster_spec: tf.train.ClusterSpec, config: tf.ConfigProto=None) -> None:
"""
Create and start a parameter server
:param cluster_spec: the ClusterSpec object representing the cluster
:param config: the tensorflow config to use
:return: None
"""
# create a server object for the parameter server
server = tf.train.Server(cluster_spec, job_name="ps", task_index=0, config=config)
# wait for the server to finish
server.join()
def create_worker_server_and_device(cluster_spec: tf.train.ClusterSpec, task_index: int,
use_cpu: bool=True, config: tf.ConfigProto=None) -> Tuple[str, tf.device]:
"""
Creates a worker server and a device setter used to assign the worker's operations to
:param cluster_spec: a ClusterSpec object representing the cluster
:param task_index: the index of the worker task
:param use_cpu: if use_cpu=True, all the agent operations will be assigned to a CPU instead of a GPU
:param config: the tensorflow config to use
:return: the target string for the tf.Session and the worker device setter object
"""
# Create and start a worker
server = tf.train.Server(cluster_spec, job_name="worker", task_index=task_index, config=config)
# Assign ops to the local worker
worker_device = "/job:worker/task:{}".format(task_index)
if use_cpu:
worker_device += "/cpu:0"
else:
worker_device += "/device:GPU:0"
device = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster_spec)
return server.target, device
def create_monitored_session(target: tf.train.Server, task_index: int,
checkpoint_dir: str, save_checkpoint_secs: int, config: tf.ConfigProto=None) -> tf.Session:
"""
Create a monitored session for the worker
:param target: the target string for the tf.Session
:param task_index: the task index of the worker
:param checkpoint_dir: a directory path where the checkpoints will be stored
:param save_checkpoint_secs: the number of seconds between storing checkpoints
:param config: the tensorflow configuration (optional)
:return: the session to use for the run
"""
# we chose the first task to be the chief
is_chief = task_index == 0
# Create the monitored session
sess = tf.train.MonitoredTrainingSession(
master=target,
is_chief=is_chief,
hooks=[],
checkpoint_dir=checkpoint_dir,
save_checkpoint_secs=save_checkpoint_secs,
config=config
)
return sess
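# Illustrative sketch of wiring the helpers above together for a single worker
# (the host:port lists and the checkpoint directory are placeholders):
# cluster = create_cluster_spec("localhost:2222", "localhost:2223,localhost:2224")
# target, device = create_worker_server_and_device(cluster, task_index=0, use_cpu=True)
# sess = create_monitored_session(target, task_index=0, checkpoint_dir="./checkpoints",
#                                 save_checkpoint_secs=600)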

View File

@@ -0,0 +1,114 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List, Union
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.shared_variables import SharedRunningStats
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import InputEmbedding
class InputEmbedder(object):
"""
An input embedder is the first part of the network, which takes the input from the state and produces a vector
embedding by passing it through a neural network. The embedder will mostly be input type dependent, and there
can be multiple embedders in a single network
"""
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=None, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling=1.0, input_offset=0.0, input_clipping=None):
self.name = name
self.input_size = input_size
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.input = None
self.output = None
self.scheme = scheme
self.return_type = InputEmbedding
self.layers = []
self.input_rescaling = input_rescaling
self.input_offset = input_offset
self.input_clipping = input_clipping
def __call__(self, prev_input_placeholder=None):
with tf.variable_scope(self.get_name()):
if prev_input_placeholder is None:
self.input = tf.placeholder("float", shape=[None] + self.input_size, name=self.get_name())
else:
self.input = prev_input_placeholder
self._build_module()
return self.input, self.output
def _build_module(self):
# NOTE: for image inputs, we expect the data format to be of type uint8, in order to be memory efficient. We chose
# not to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
# input to the network to be float, which is 4x more expensive in memory,
# thus also making each saved transition in the memory 4x more expensive.
input_layer = self.input / self.input_rescaling
input_layer -= self.input_offset
# clip the input using the given range
if self.input_clipping is not None:
input_layer = tf.clip_by_value(input_layer, self.input_clipping[0], self.input_clipping[1])
self.layers.append(input_layer)
# layers order is conv -> batchnorm -> activation -> dropout
if isinstance(self.scheme, EmbedderScheme):
layers_params = self.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(input_layer=self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = tf.contrib.layers.flatten(self.layers[-1])
@property
def input_size(self) -> List[int]:
return self._input_size
@input_size.setter
def input_size(self, value: Union[int, List[int]]):
if isinstance(value, np.ndarray) or isinstance(value, tuple):
value = list(value)
elif isinstance(value, int):
value = [value]
if not isinstance(value, list):
raise ValueError((
'input_size expected to be a list, found {value} which has type {type}'
).format(value=value, type=type(value)))
self._input_size = value
@property
def schemes(self):
raise NotImplementedError("Inheriting embedder must define schemes matching its allowed default "
"configurations.")
def get_name(self):
return self.name
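# Minimal numpy sketch of the preprocessing order applied in _build_module above:
# rescale, then subtract the offset, then (optionally) clip.
import numpy as np

def illustrative_preprocess(raw_input, rescaling=1.0, offset=0.0, clipping=None):
    x = raw_input / rescaling - offset
    if clipping is not None:
        x = np.clip(x, clipping[0], clipping[1])
    return x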

View File

@@ -0,0 +1,74 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Conv2d
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputImageEmbedding
class ImageEmbedder(InputEmbedder):
"""
An input embedder that performs convolutions on the input and then flattens the result.
The embedder is intended for image like inputs, where the channels are expected to be the last axis.
The embedder also allows custom rescaling of the input prior to the neural network.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Conv2d([32, 3, 1])
],
# atari dqn
EmbedderScheme.Medium:
[
Conv2d([32, 8, 4]),
Conv2d([64, 4, 2]),
Conv2d([64, 3, 1])
],
# carla
EmbedderScheme.Deep: \
[
Conv2d([32, 5, 2]),
Conv2d([32, 3, 1]),
Conv2d([64, 3, 2]),
Conv2d([64, 3, 1]),
Conv2d([128, 3, 2]),
Conv2d([128, 3, 1]),
Conv2d([256, 3, 2]),
Conv2d([256, 3, 1])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=255.0, input_offset: float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name, input_rescaling,
input_offset, input_clipping)
self.return_type = InputImageEmbedding
if len(input_size) != 3 and scheme != EmbedderScheme.Empty:
raise ValueError("Image embedders expect the input size to have 3 dimensions. The given size is: {}"
.format(input_size))

View File

@@ -0,0 +1,64 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.embedders.embedder import InputEmbedder
from rl_coach.core_types import InputVectorEmbedding
class VectorEmbedder(InputEmbedder):
"""
An input embedder that is intended for inputs that can be represented as vectors.
The embedder flattens the input, applies several dense layers to it and returns the output.
"""
schemes = {
EmbedderScheme.Empty:
[],
EmbedderScheme.Shallow:
[
Dense([128])
],
# dqn
EmbedderScheme.Medium:
[
Dense([256])
],
# carla
EmbedderScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
scheme: EmbedderScheme=EmbedderScheme.Medium, batchnorm: bool=False, dropout: bool=False,
name: str= "embedder", input_rescaling: float=1.0, input_offset:float=0.0, input_clipping=None):
super().__init__(input_size, activation_function, scheme, batchnorm, dropout, name,
input_rescaling, input_offset, input_clipping)
self.return_type = InputVectorEmbedding
if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
raise ValueError("The input size of a vector embedder must contain only a single dimension")
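# Illustrative instantiations (the shapes below are placeholders, not Coach defaults):
# image_embedder = ImageEmbedder([84, 84, 4], scheme=EmbedderScheme.Medium)   # stacked image frames
# vector_embedder = VectorEmbedder([17], scheme=EmbedderScheme.Medium)        # a low-dimensional state vector
# input_placeholder, embedding = vector_embedder()                            # builds the placeholder and layers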

View File

@@ -0,0 +1,344 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from typing import Dict
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.architectures.tensorflow_components.middlewares.middleware import MiddlewareParameters
from rl_coach.base_parameters import AgentParameters, InputEmbedderParameters, EmbeddingMergerType
from rl_coach.spaces import SpacesDefinition, PlanarMapsObservationSpace
from rl_coach.utils import get_all_subclasses, dynamic_import_and_instantiate_module_from_params
from rl_coach.architectures.tensorflow_components.architecture import TensorFlowArchitecture
from rl_coach.core_types import PredictionType
class GeneralTensorFlowNetwork(TensorFlowArchitecture):
"""
A generalized version of all possible networks implemented using tensorflow.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str,
global_network=None, network_is_local: bool=True, network_is_trainable: bool=False):
"""
:param agent_parameters: the agent parameters
:param spaces: the spaces definition of the agent
:param name: the name of the network
:param global_network: the global network replica that is shared between all the workers
:param network_is_local: is the network global (shared between workers) or local (dedicated to the worker)
:param network_is_trainable: is the network trainable (we can apply gradients on it)
"""
self.global_network = global_network
self.network_is_local = network_is_local
self.network_wrapper_name = name.split('/')[0]
self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
self.num_heads_per_network = 1 if self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.num_networks = 1 if not self.network_parameters.use_separate_networks_per_head else \
len(self.network_parameters.heads_parameters)
self.gradients_from_head_rescalers = []
self.gradients_from_head_rescalers_placeholders = []
self.update_head_rescaler_value_ops = []
self.adaptive_learning_rate_scheme = None
self.current_learning_rate = None
# init network modules containers
self.input_embedders = []
self.output_heads = []
super().__init__(agent_parameters, spaces, name, global_network,
network_is_local, network_is_trainable)
def fill_return_types():
ret_dict = {}
for cls in get_all_subclasses(PredictionType):
ret_dict[cls] = []
components = self.input_embedders + [self.middleware] + self.output_heads
for component in components:
if not hasattr(component, 'return_type'):
raise ValueError("{} has no return_type attribute. This should not happen.".format(component))
if component.return_type is not None:
ret_dict[component.return_type].append(component)
return ret_dict
self.available_return_types = fill_return_types()
def predict_with_prediction_type(self, states: Dict[str, np.ndarray],
prediction_type: PredictionType) -> Dict[str, np.ndarray]:
"""
Search for the component(s) whose return_type is set to the requested PredictionType, and get
predictions for them.
:param states: The input states to the network.
:param prediction_type: The requested PredictionType to look for in the network components
:return: A dictionary with predictions for all components matching the requested prediction type
"""
ret_dict = {}
for component in self.available_return_types[prediction_type]:
ret_dict[component] = self.predict(inputs=states, outputs=component.output)
return ret_dict
@staticmethod
def get_activation_function(activation_function_string: str):
"""
Map the activation function from a string to the tensorflow framework equivalent
:param activation_function_string: the type of the activation function
:return: the tensorflow activation function
"""
activation_functions = {
'relu': tf.nn.relu,
'tanh': tf.nn.tanh,
'sigmoid': tf.nn.sigmoid,
'elu': tf.nn.elu,
'selu': tf.nn.selu,
'leaky_relu': tf.nn.leaky_relu,
'none': None
}
assert activation_function_string in activation_functions.keys(), \
"Activation function must be one of the following {}. Instead it was: {}"\
.format(activation_functions.keys(), activation_function_string)
return activation_functions[activation_function_string]
def get_input_embedder(self, input_name: str, embedder_params: InputEmbedderParameters):
"""
Given an input embedder parameters class, creates the input embedder and returns it
:param input_name: the name of the input to the embedder (used for retrieving the shape). The input should
be a value within the state or the action.
:param embedder_params: the parameters of the class of the embedder
:return: the embedder instance
"""
allowed_inputs = copy.copy(self.spaces.state.sub_spaces)
allowed_inputs["action"] = copy.copy(self.spaces.action)
allowed_inputs["goal"] = copy.copy(self.spaces.goal)
if input_name not in allowed_inputs.keys():
raise ValueError("The key for the input embedder ({}) must match one of the following keys: {}"
.format(input_name, allowed_inputs.keys()))
type = "vector"
if isinstance(allowed_inputs[input_name], PlanarMapsObservationSpace):
type = "image"
embedder_path = 'rl_coach.architectures.tensorflow_components.embedders.' + embedder_params.path[type]
embedder_params_copy = copy.copy(embedder_params)
embedder_params_copy.activation_function = self.get_activation_function(embedder_params.activation_function)
embedder_params_copy.input_rescaling = embedder_params_copy.input_rescaling[type]
embedder_params_copy.input_offset = embedder_params_copy.input_offset[type]
embedder_params_copy.name = input_name
module = dynamic_import_and_instantiate_module_from_params(embedder_params_copy,
path=embedder_path,
positional_args=[allowed_inputs[input_name].shape])
return module
def get_middleware(self, middleware_params: MiddlewareParameters):
"""
Given a middleware type, creates the middleware and returns it
:param middleware_params: the parameters of the middleware class
:return: the middleware instance
"""
middleware_params_copy = copy.copy(middleware_params)
middleware_params_copy.activation_function = self.get_activation_function(middleware_params.activation_function)
module = dynamic_import_and_instantiate_module_from_params(middleware_params_copy)
return module
def get_output_head(self, head_params: HeadParameters, head_idx: int, loss_weight: float=1.):
"""
Given a head type, creates the head and returns it
:param head_params: the parameters of the head to create. The head class is given by its path under the heads
directory, or by a full path in the following structure: <module_path>:<class_path>
:param head_idx: the head index
:param loss_weight: the weight to assign to the head's loss
:return: the head
"""
head_params_copy = copy.copy(head_params)
head_params_copy.activation_function = self.get_activation_function(head_params_copy.activation_function)
return dynamic_import_and_instantiate_module_from_params(head_params_copy, extra_kwargs={
'agent_parameters': self.ap, 'spaces': self.spaces, 'network_name': self.network_wrapper_name,
'head_idx': head_idx, 'loss_weight': loss_weight, 'is_local': self.network_is_local})
def get_model(self):
# validate the configuration
if len(self.network_parameters.input_embedders_parameters) == 0:
raise ValueError("At least one input type should be defined")
if len(self.network_parameters.heads_parameters) == 0:
raise ValueError("At least one output type should be defined")
if self.network_parameters.middleware_parameters is None:
raise ValueError("Exactly one middleware type should be defined")
if len(self.network_parameters.loss_weights) == 0:
raise ValueError("At least one loss weight should be defined")
if len(self.network_parameters.heads_parameters) != len(self.network_parameters.loss_weights):
raise ValueError("Number of loss weights should match the number of output types")
for network_idx in range(self.num_networks):
with tf.variable_scope('network_{}'.format(network_idx)):
####################
# Input Embeddings #
####################
state_embedding = []
for input_name in sorted(self.network_parameters.input_embedders_parameters):
input_type = self.network_parameters.input_embedders_parameters[input_name]
# get the class of the input embedder
input_embedder = self.get_input_embedder(input_name, input_type)
self.input_embedders.append(input_embedder)
# input placeholders are reused between networks. on the first network, store the placeholders
# generated by the input_embedders in self.inputs. on the rest of the networks, pass
# the existing input_placeholders into the input_embedders.
if network_idx == 0:
input_placeholder, embedding = input_embedder()
self.inputs[input_name] = input_placeholder
else:
input_placeholder, embedding = input_embedder(self.inputs[input_name])
state_embedding.append(embedding)
##########
# Merger #
##########
if len(state_embedding) == 1:
state_embedding = state_embedding[0]
else:
if self.network_parameters.embedding_merger_type == EmbeddingMergerType.Concat:
state_embedding = tf.concat(state_embedding, axis=-1, name="merger")
elif self.network_parameters.embedding_merger_type == EmbeddingMergerType.Sum:
state_embedding = tf.add_n(state_embedding, name="merger")
##############
# Middleware #
##############
self.middleware = self.get_middleware(self.network_parameters.middleware_parameters)
_, self.state_embedding = self.middleware(state_embedding)
################
# Output Heads #
################
head_count = 0
for head_idx in range(self.num_heads_per_network):
for head_copy_idx in range(self.network_parameters.num_output_head_copies):
if self.network_parameters.use_separate_networks_per_head:
# if we use separate networks per head, then the head type corresponds to the network idx
head_type_idx = network_idx
head_count = network_idx
else:
# if we use a single network with multiple heads, then the head type is the current head idx
head_type_idx = head_idx
self.output_heads.append(
self.get_output_head(self.network_parameters.heads_parameters[head_type_idx],
head_copy_idx,
self.network_parameters.loss_weights[head_type_idx])
)
# rescale the gradients from the head
self.gradients_from_head_rescalers.append(
tf.get_variable('gradients_from_head_{}-{}_rescalers'.format(head_idx, head_copy_idx),
initializer=float(
self.network_parameters.rescale_gradient_from_head_by_factor[head_count]
),
dtype=tf.float32))
self.gradients_from_head_rescalers_placeholders.append(
tf.placeholder('float',
name='gradients_from_head_{}-{}_rescalers'.format(head_type_idx, head_copy_idx)))
self.update_head_rescaler_value_ops.append(self.gradients_from_head_rescalers[head_count].assign(
self.gradients_from_head_rescalers_placeholders[head_count]))
head_input = (1-self.gradients_from_head_rescalers[head_count]) * tf.stop_gradient(self.state_embedding) + \
self.gradients_from_head_rescalers[head_count] * self.state_embedding
# build the head
if self.network_is_local:
output, target_placeholder, input_placeholders, importance_weight_ph = \
self.output_heads[-1](head_input)
self.targets.extend(target_placeholder)
self.importance_weights.extend(importance_weight_ph)
else:
output, input_placeholders = self.output_heads[-1](head_input)
self.outputs.extend(output)
# TODO: use head names as well
for placeholder_index, input_placeholder in enumerate(input_placeholders):
self.inputs['output_{}_{}'.format(head_type_idx, placeholder_index)] = input_placeholder
head_count += 1
# Losses
self.losses = tf.losses.get_losses(self.full_name)
self.losses += tf.losses.get_regularization_losses(self.full_name)
self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.full_name)
# tf.summary.scalar('total_loss', self.total_loss)
# Learning rate
if self.network_parameters.learning_rate_decay_rate != 0:
self.adaptive_learning_rate_scheme = \
tf.train.exponential_decay(
self.network_parameters.learning_rate,
self.global_step,
decay_steps=self.network_parameters.learning_rate_decay_steps,
decay_rate=self.network_parameters.learning_rate_decay_rate,
staircase=True)
self.current_learning_rate = self.adaptive_learning_rate_scheme
else:
self.current_learning_rate = self.network_parameters.learning_rate
# Optimizer
if self.distributed_training and self.network_is_local and self.network_parameters.shared_optimizer:
# distributed training + is a local network + optimizer shared -> take the global optimizer
self.optimizer = self.global_network.optimizer
elif (self.distributed_training and self.network_is_local and not self.network_parameters.shared_optimizer) \
or self.network_parameters.shared_optimizer or not self.distributed_training:
# distributed training + is a global network + optimizer shared
# OR
# distributed training + is a local network + optimizer not shared
# OR
# non-distributed training
# -> create an optimizer
if self.network_parameters.optimizer_type == 'Adam':
self.optimizer = tf.train.AdamOptimizer(learning_rate=self.current_learning_rate,
beta1=self.network_parameters.adam_optimizer_beta1,
beta2=self.network_parameters.adam_optimizer_beta2,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'RMSProp':
self.optimizer = tf.train.RMSPropOptimizer(self.current_learning_rate,
decay=self.network_parameters.rms_prop_optimizer_decay,
epsilon=self.network_parameters.optimizer_epsilon)
elif self.network_parameters.optimizer_type == 'LBFGS':
self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B',
options={'maxiter': 25})
else:
raise Exception("{} is not a valid optimizer type".format(self.network_parameters.optimizer_type))
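The head_input expression above (a convex combination of the embedding and its stop_gradient copy) is a gradient-rescaling trick: the forward value is always the embedding itself, while the gradient flowing from the head back into the middleware is multiplied by the rescaler. A minimal NumPy sketch of the idea, independent of the TensorFlow code above (names and numbers are illustrative):

import numpy as np

def rescaled_head_input(embedding, rescaler):
    # emulate stop_gradient by treating this copy as a constant w.r.t. the embedding
    detached = embedding.copy()
    return (1.0 - rescaler) * detached + rescaler * embedding

embedding = np.array([0.5, -1.0, 2.0])
for k in (0.0, 0.5, 1.0):
    out = rescaled_head_input(embedding, k)
    # forward pass: identical to the embedding for any rescaler value
    assert np.allclose(out, embedding)
    # backward pass: d(out)/d(embedding) = k, since the detached copy contributes no gradient,
    # so a rescaler of 0 blocks the head's gradient entirely and 1 passes it through unchanged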

View File

@@ -0,0 +1,54 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class CategoricalQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):
super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)
class CategoricalQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'categorical_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.input = [self.actions]
values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,
self.num_atoms))
# softmax on atoms dimension
self.output = tf.nn.softmax(values_distribution)
# calculate cross entropy loss
self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),
name="distributions")
self.target = self.distributions
self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)
tf.losses.add_loss(self.loss)
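For context, a categorical (C51) head such as the one above outputs, per action, a softmax distribution over a fixed support of atoms, and the agent recovers Q-values by taking the expectation over that support. A short NumPy sketch of that step; the support bounds and atom count below are illustrative and not taken from this file:

import numpy as np

num_actions, num_atoms = 3, 11
v_min, v_max = -10.0, 10.0                      # assumed support bounds
support = np.linspace(v_min, v_max, num_atoms)  # z_1 ... z_N

logits = np.random.randn(num_actions, num_atoms)
# softmax over the atoms dimension, as in the head above
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

# Q(s, a) = sum_i z_i * p_i(s, a), the expected value under each action's distribution
q_values = (probs * support).sum(axis=-1)
greedy_action = int(np.argmax(q_values))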

View File

@@ -0,0 +1,66 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.core_types import ActionProbabilities
class DDPGActorHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params', batchnorm: bool=True):
super().__init__(parameterized_class=DDPGActor, activation_function=activation_function, name=name)
self.batchnorm = batchnorm
class DDPGActor(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh',
batchnorm: bool=True):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ddpg_actor_head'
self.return_type = ActionProbabilities
self.num_actions = self.spaces.action.shape
self.batchnorm = batchnorm
# bounded actions
self.output_scale = self.spaces.action.max_abs_range
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
self.action_penalty = None
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, self.num_actions, name='fc_mean')
policy_values_mean = batchnorm_activation_dropout(pre_activation_policy_values_mean, self.batchnorm,
self.activation_function,
False, 0, 0)[-1]
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += \
[self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
self.output = [self.policy_mean]
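For reference, bounding the actor output works by squashing the pre-activation values (tanh by default here) and multiplying by the half-range of the action space, so the resulting mean always lies inside the action bounds. A tiny NumPy illustration with an arbitrary range:

import numpy as np

max_abs_range = np.array([2.0, 0.5])      # per-dimension action bound (illustrative)
pre_activation = np.array([3.0, -10.0])   # unbounded network output

action_mean = np.tanh(pre_activation) * max_abs_range
# every dimension is guaranteed to lie within [-max_abs_range, max_abs_range]
assert np.all(np.abs(action_mean) <= max_abs_range)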

View File

@@ -0,0 +1,87 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
from rl_coach.memories.non_episodic import differentiable_neural_dictionary
class DNDQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dnd_q_head_params'):
super().__init__(parameterized_class=DNDQHead, activation_function=activation_function, name=name)
class DNDQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dnd_q_values_head'
self.DND_size = agent_parameters.algorithm.dnd_size
self.DND_key_error_threshold = agent_parameters.algorithm.DND_key_error_threshold
self.l2_norm_added_delta = agent_parameters.algorithm.l2_norm_added_delta
self.new_value_shift_coefficient = agent_parameters.algorithm.new_value_shift_coefficient
self.number_of_nn = agent_parameters.algorithm.number_of_knn
self.ap = agent_parameters
self.dnd_embeddings = [None] * self.num_actions
self.dnd_values = [None] * self.num_actions
self.dnd_indices = [None] * self.num_actions
self.dnd_distances = [None] * self.num_actions
if self.ap.memory.shared_memory:
self.shared_memory_scratchpad = self.ap.task_parameters.shared_memory_scratchpad
def _build_module(self, input_layer):
if hasattr(self.ap.task_parameters, 'checkpoint_restore_dir') and self.ap.task_parameters.checkpoint_restore_dir:
self.DND = differentiable_neural_dictionary.load_dnd(self.ap.task_parameters.checkpoint_restore_dir)
else:
self.DND = differentiable_neural_dictionary.QDND(
self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient,
key_error_threshold=self.DND_key_error_threshold,
learning_rate=self.network_parameters.learning_rate,
num_neighbors=self.number_of_nn,
override_existing_keys=True)
# Retrieve info from DND dictionary
# We assume that all actions have enough entries in the DND
self.output = tf.transpose([
self._q_value(input_layer, action)
for action in range(self.num_actions)
])
def _q_value(self, input_layer, action):
result = tf.py_func(self.DND.query,
[input_layer, action, self.number_of_nn],
[tf.float64, tf.float64, tf.int64])
self.dnd_embeddings[action] = tf.to_float(result[0])
self.dnd_values[action] = tf.to_float(result[1])
self.dnd_indices[action] = result[2]
# DND calculation
square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1))
distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta]
self.dnd_distances[action] = distances
weights = 1.0 / distances
normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True)
q_value = tf.reduce_sum(self.dnd_values[action] * normalised_weights, axis=1)
q_value.set_shape((None,))
return q_value
def _post_build(self):
# DND gradients
self.dnd_embeddings_grad = tf.gradients(self.loss[0], self.dnd_embeddings)
self.dnd_values_grad = tf.gradients(self.loss[0], self.dnd_values)
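The _q_value computation above is an inverse-distance-weighted average over the values of the nearest keys returned from the DND. A standalone NumPy sketch of that weighting; the arrays below are invented for illustration:

import numpy as np

def dnd_q_value(query_embedding, neighbor_keys, neighbor_values, delta=1e-3):
    # squared L2 distance between the query embedding and each retrieved key
    distances = np.sum((neighbor_keys - query_embedding) ** 2, axis=-1) + delta
    weights = 1.0 / distances
    normalized_weights = weights / weights.sum()
    # kernel-weighted average of the stored values
    return float(np.sum(neighbor_values * normalized_weights))

keys = np.random.randn(5, 8)     # 5 nearest neighbors with 8-dimensional keys
values = np.random.randn(5)      # their stored Q-values
query = np.random.randn(8)
print(dnd_q_value(query, keys, values))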

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.architectures.tensorflow_components.heads.q_head import QHead
from rl_coach.spaces import SpacesDefinition
class DuelingQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='dueling_q_head_params'):
super().__init__(parameterized_class=DuelingQHead, activation_function=activation_function, name=name)
class DuelingQHead(QHead):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'dueling_q_values_head'
def _build_module(self, input_layer):
# state value tower - V
with tf.variable_scope("state_value"):
state_value = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
state_value = tf.layers.dense(state_value, 1, name='fc2')
# state_value = tf.expand_dims(state_value, axis=-1)
# action advantage tower - A
with tf.variable_scope("action_advantage"):
action_advantage = tf.layers.dense(input_layer, 512, activation=self.activation_function, name='fc1')
action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2')
action_advantage = action_advantage - tf.reduce_mean(action_advantage, axis=1, keepdims=True)
# merge to state-action value function Q
self.output = tf.add(state_value, action_advantage, name='output')
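As a quick illustration of the dueling combination, the advantages are centered per state (mean over the action dimension) before being added to the state value, which keeps V identifiable. A small NumPy example with arbitrary numbers:

import numpy as np

state_value = np.array([[1.5], [0.2]])                     # (batch, 1)
advantage = np.array([[2.0, 0.0, -2.0], [1.0, 1.0, 1.0]])  # (batch, num_actions)

centered = advantage - advantage.mean(axis=1, keepdims=True)
q_values = state_value + centered
# first state: [3.5, 1.5, -0.5]; second state: [0.2, 0.2, 0.2]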

View File

@@ -0,0 +1,165 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters, Parameters
from rl_coach.spaces import SpacesDefinition
from tensorflow.python.ops.losses.losses_impl import Reduction
from rl_coach.utils import force_list
# Used to initialize weights for policy and value output layers
def normalized_columns_initializer(std=1.0):
def _initializer(shape, dtype=None, partition_info=None):
out = np.random.randn(*shape).astype(np.float32)
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
return tf.constant(out)
return _initializer
class HeadParameters(Parameters):
def __init__(self, parameterized_class: Type['Head'], activation_function: str = 'relu', name: str= 'head'):
super().__init__()
self.activation_function = activation_function
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Head(object):
"""
A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through
a neural network to produce the output of the network. There can be multiple heads in a network, and each one has
an assigned loss function. The heads are algorithm dependent.
"""
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int=0, loss_weight: float=1., is_local: bool=True, activation_function: str='relu'):
self.head_idx = head_idx
self.network_name = network_name
self.network_parameters = agent_parameters.network_wrappers[self.network_name]
self.name = "head"
self.output = []
self.loss = []
self.loss_type = []
self.regularizations = []
self.loss_weight = force_list(loss_weight)
self.target = []
self.importance_weight = []
self.input = []
self.is_local = is_local
self.ap = agent_parameters
self.spaces = spaces
self.return_type = None
self.activation_function = activation_function
def __call__(self, input_layer):
"""
Wrapper for building the module graph including scoping and loss creation
:param input_layer: the input to the graph
:return: the head's output and input placeholders (for a local network, also the target and importance weight placeholders)
"""
with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
self._build_module(input_layer)
self.output = force_list(self.output)
self.target = force_list(self.target)
self.input = force_list(self.input)
self.loss_type = force_list(self.loss_type)
self.loss = force_list(self.loss)
self.regularizations = force_list(self.regularizations)
if self.is_local:
self.set_loss()
self._post_build()
if self.is_local:
return self.output, self.target, self.input, self.importance_weight
else:
return self.output, self.input
def _build_module(self, input_layer):
"""
Builds the graph of the module
This method is called early on from __call__. It is expected to store the module's output tensors
in self.output.
:param input_layer: the input to the graph
:return: None
"""
pass
def _post_build(self):
"""
Optional function that allows adding any extra definitions after the head has been fully defined
For example, this allows doing additional calculations that are based on the loss
:return: None
"""
pass
def get_name(self):
"""
Get a formatted name for the module
:return: the formatted name
"""
return '{}_{}'.format(self.name, self.head_idx)
def set_loss(self):
"""
Creates a target placeholder and loss function for each loss_type and regularization
:return: None
"""
# there are heads that define the loss internally, but we need to create additional placeholders for them
for idx in range(len(self.loss)):
importance_weight = tf.placeholder('float',
[None] + [1] * (len(self.target[idx].shape) - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# add losses and target placeholder
for idx in range(len(self.loss_type)):
# create target placeholder
target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name()))
self.target.append(target)
# create importance sampling weights placeholder
num_target_dims = len(self.target[idx].shape)
importance_weight = tf.placeholder('float', [None] + [1] * (num_target_dims - 1),
'{}_importance_weight'.format(self.get_name()))
self.importance_weight.append(importance_weight)
# compute the weighted loss. importance_weight weights over the samples in the batch, while self.loss_weight
# weights the specific loss of this head against other losses in this head or in other heads
loss_weight = self.loss_weight[idx]*importance_weight
loss = self.loss_type[idx](self.target[-1], self.output[idx],
scope=self.get_name(), reduction=Reduction.NONE, loss_collection=None)
# the loss is first summed over each sample in the batch and then the mean over the batch is taken
loss = tf.reduce_mean(loss_weight*tf.reduce_sum(loss, axis=list(range(1, num_target_dims))))
# we add the loss to the losses collection and later we will extract it in general_network
tf.losses.add_loss(loss)
self.loss.append(loss)
# add regularizations
for regularization in self.regularizations:
self.loss.append(regularization)
@classmethod
def path(cls):
return cls.__name__
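To make the weighting in set_loss above concrete: the intent is that each sample's loss is summed over its non-batch dimensions, scaled by the head's loss weight times a per-sample importance-sampling weight, and then averaged over the batch. A NumPy sketch of that reduction, with illustrative shapes and values:

import numpy as np

per_element_loss = np.random.rand(4, 3)                      # e.g. element-wise squared errors
importance_weight = np.array([[1.0], [0.5], [2.0], [1.0]])   # (batch, 1), e.g. from prioritized replay
loss_weight = 0.5                                            # head-level loss weight

per_sample_loss = per_element_loss.sum(axis=1, keepdims=True)  # sum over the non-batch dims
total_head_loss = (loss_weight * importance_weight * per_sample_loss).mean()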

View File

@@ -0,0 +1,65 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import Measurements
class MeasurementsPredictionHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='measurements_prediction_head_params'):
super().__init__(parameterized_class=MeasurementsPredictionHead,
activation_function=activation_function, name=name)
class MeasurementsPredictionHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'future_measurements_head'
self.num_actions = len(self.spaces.action.actions)
self.num_measurements = self.spaces.state['measurements'].shape[0]
self.num_prediction_steps = agent_parameters.algorithm.num_predicted_steps_ahead
self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps
self.return_type = Measurements
def _build_module(self, input_layer):
# This is almost exactly the same as Dueling Network but we predict the future measurements for each action
# actions expectation tower (expectation stream) - E
with tf.variable_scope("expectation_stream"):
expectation_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output')
expectation_stream = tf.expand_dims(expectation_stream, axis=1)
# action fine differences tower (action stream) - A
with tf.variable_scope("action_stream"):
action_stream = tf.layers.dense(input_layer, 256, activation=self.activation_function, name='fc1')
action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size,
name='output')
action_stream = tf.reshape(action_stream,
(tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size))
action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keepdims=True)
# merge to future measurements predictions
self.output = tf.add(expectation_stream, action_stream, name='output')
self.target = tf.placeholder(tf.float32, [None, self.num_actions, self.multi_step_measurements_size],
name="targets")
targets_nonan = tf.where(tf.is_nan(self.target), self.output, self.target)
self.loss = tf.reduce_sum(tf.reduce_mean(tf.square(targets_nonan - self.output), reduction_indices=0))
tf.losses.add_loss(self.loss_weight[0] * self.loss)
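The tf.where trick above masks unobserved targets: wherever the target tensor contains NaN, the prediction is substituted for it, so those entries contribute exactly zero error. A NumPy illustration with made-up numbers:

import numpy as np

prediction = np.array([[0.5, 1.0], [2.0, -1.0]])
target = np.array([[1.5, np.nan], [np.nan, -2.0]])

target_nonan = np.where(np.isnan(target), prediction, target)
squared_error = (target_nonan - prediction) ** 2   # NaN positions become exactly 0
loss = squared_error.mean(axis=0).sum()            # mean over the batch, sum over the rest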

View File

@@ -0,0 +1,88 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class NAFHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='naf_head_params'):
super().__init__(parameterized_class=NAFHead, activation_function=activation_function, name=name)
class NAFHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
if not isinstance(self.spaces.action, BoxActionSpace):
raise ValueError("NAF works only for continuous action spaces (BoxActionSpace)")
self.name = 'naf_q_values_head'
self.num_actions = self.spaces.action.shape[0]
self.output_scale = self.spaces.action.max_abs_range
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# NAF
self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action")
self.input = self.action
# V Head
self.V = tf.layers.dense(input_layer, 1, name='V')
# mu Head
mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=self.activation_function, name='mu_unscaled')
self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu')
# A Head
# l_vector is a vector holding the entries of a lower-triangular matrix
self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) // 2, name='l_vector')
# Convert l to a lower triangular matrix and exponentiate its diagonal
i = 0
columns = []
for col in range(self.num_actions):
start_row = col
num_non_zero_elements = self.num_actions - start_row
zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row])
diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1)
non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)]
columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1))
i += num_non_zero_elements
self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1))
# P = L*L^T
self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1)))
# A = -1/2 * (u - mu)^T * P * (u - mu)
action_diff = tf.expand_dims(self.action - self.mu, -1)
a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff))
self.A = tf.reshape(a_matrix_form, [-1, 1])
# Q Head
self.Q = tf.add(self.V, self.A, name='Q')
self.output = self.Q
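The lower-triangular construction above packs num_actions * (num_actions + 1) // 2 values into L, exponentiates the diagonal, forms P = L L^T, and evaluates the quadratic advantage. The same algebra for a single sample in NumPy (sizes and values are illustrative):

import numpy as np

num_actions = 3
l_vector = np.random.randn(num_actions * (num_actions + 1) // 2)

# unpack the flat vector into a lower-triangular matrix with an exponentiated diagonal
L = np.zeros((num_actions, num_actions))
idx = 0
for col in range(num_actions):
    n = num_actions - col
    L[col, col] = np.exp(l_vector[idx])
    L[col + 1:, col] = l_vector[idx + 1: idx + n]
    idx += n

P = L @ L.T                                  # positive semi-definite by construction
mu = np.random.randn(num_actions)            # policy mean
u = np.random.randn(num_actions)             # taken action
advantage = -0.5 * (u - mu) @ P @ (u - mu)   # always <= 0, maximized when u == mu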

View File

@@ -0,0 +1,151 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, CompoundActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.core_types import ActionProbabilities
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
class PolicyHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='policy_head_params'):
super().__init__(parameterized_class=PolicyHead, activation_function=activation_function, name=name)
class PolicyHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'policy_values_head'
self.return_type = ActionProbabilities
self.beta = None
self.action_penalty = None
self.exploration_policy = agent_parameters.exploration
# a scalar weight that penalizes low entropy values to encourage exploration
if hasattr(agent_parameters.algorithm, 'beta_entropy'):
self.beta = agent_parameters.algorithm.beta_entropy
# a scalar weight that penalizes high activation values (before the activation function) for the final layer
if hasattr(agent_parameters.algorithm, 'action_penalty'):
self.action_penalty = agent_parameters.algorithm.action_penalty
def _build_module(self, input_layer):
self.actions = []
self.input = self.actions
self.policy_distributions = []
self.output = []
action_spaces = [self.spaces.action]
if isinstance(self.spaces.action, CompoundActionSpace):
action_spaces = self.spaces.action.sub_action_spaces
# create a compound action network
for action_space_idx, action_space in enumerate(action_spaces):
with tf.variable_scope("sub_action_{}".format(action_space_idx)):
if isinstance(action_space, DiscreteActionSpace):
# create a discrete action network (softmax probabilities output)
self._build_discrete_net(input_layer, action_space)
elif isinstance(action_space, BoxActionSpace):
# create a continuous action network (bounded mean and stdev outputs)
self._build_continuous_net(input_layer, action_space)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.add_n([tf.reduce_mean(dist.entropy()) for dist in self.policy_distributions])
self.regularizations += [-tf.multiply(self.beta, self.entropy, name='entropy_regularization')]
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate loss
self.action_log_probs_wrt_policy = \
tf.add_n([dist.log_prob(action) for dist, action in zip(self.policy_distributions, self.actions)])
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages)
tf.losses.add_loss(self.loss_weight[0] * self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions.append(tf.placeholder(tf.int32, [None], name="actions"))
policy_values = tf.layers.dense(input_layer, num_actions, name='fc')
self.policy_probs = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
# (the + eps is to prevent probability 0 which will cause the log later on to be -inf)
policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_probs + eps))
self.policy_distributions.append(policy_distribution)
self.output.append(self.policy_probs)
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape
self.actions.append(tf.placeholder(tf.float32, [None, num_actions], name="actions"))
# output activation function
if np.all(action_space.max_abs_range < np.inf):
# bounded actions
self.output_scale = action_space.max_abs_range
self.continuous_output_activation = self.activation_function
else:
# unbounded actions
self.output_scale = 1
self.continuous_output_activation = None
# mean
pre_activation_policy_values_mean = tf.layers.dense(input_layer, num_actions, name='fc_mean')
policy_values_mean = pre_activation_policy_values_mean if self.continuous_output_activation is None else self.continuous_output_activation(pre_activation_policy_values_mean)
self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean')
self.output.append(self.policy_mean)
# standard deviation
if isinstance(self.exploration_policy, ContinuousEntropyParameters):
# the stdev is an output of the network and uses a softplus activation as defined in A3C
policy_values_std = tf.layers.dense(input_layer, num_actions,
kernel_initializer=normalized_columns_initializer(0.01), name='fc_std')
self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps
self.output.append(self.policy_std)
else:
# the stdev is an externally given value
# Warning: we need to explicitly put this variable in the local variables collections, since defining
# it as not trainable puts it for some reason in the global variables collections. If this is not done,
# the variable won't be initialized and when working with multiple workers they will get stuck.
self.policy_std = tf.Variable(np.ones(num_actions), dtype='float32', trainable=False,
name='policy_stdev', collections=[tf.GraphKeys.LOCAL_VARIABLES])
# assign op for the policy std
self.policy_std_placeholder = tf.placeholder('float32', (num_actions,))
self.assign_policy_std = tf.assign(self.policy_std, self.policy_std_placeholder)
# define the distributions for the policy and the old policy
policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std)
self.policy_distributions.append(policy_distribution)
if self.is_local:
# add a penalty on the squared pre-activation values of the action
if self.action_penalty and self.action_penalty != 0:
self.regularizations += [
self.action_penalty * tf.reduce_mean(tf.square(pre_activation_policy_values_mean))]
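To ground the loss above: for a discrete action space the head minimizes the negative advantage-weighted log-likelihood of the actions that were actually taken. A NumPy sketch of that computation; the probabilities, actions and advantages below are invented:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.6, 0.3]])       # softmax output per sample
actions = np.array([0, 2])                # actions that were taken
advantages = np.array([1.5, -0.5])        # estimated advantages
eps = 1e-8

log_probs = np.log(probs[np.arange(len(actions)), actions] + eps)
loss = -np.mean(log_probs * advantages)   # minimizing this performs gradient ascent on expected return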

View File

@@ -0,0 +1,144 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace
from rl_coach.spaces import SpacesDefinition
from rl_coach.utils import eps
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters, normalized_columns_initializer
from rl_coach.core_types import ActionProbabilities
class PPOHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='tanh', name: str='ppo_head_params'):
super().__init__(parameterized_class=PPOHead, activation_function=activation_function, name=name)
class PPOHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='tanh'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_head'
self.return_type = ActionProbabilities
# used in regular PPO
self.use_kl_regularization = agent_parameters.algorithm.use_kl_regularization
if self.use_kl_regularization:
# kl coefficient and its corresponding assignment operation and placeholder
self.kl_coefficient = tf.Variable(agent_parameters.algorithm.initial_kl_coefficient,
trainable=False, name='kl_coefficient')
self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph')
self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph)
self.kl_cutoff = 2 * agent_parameters.algorithm.target_kl_divergence
self.high_kl_penalty_coefficient = agent_parameters.algorithm.high_kl_penalty_coefficient
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.beta = agent_parameters.algorithm.beta_entropy
def _build_module(self, input_layer):
if isinstance(self.spaces.action, DiscreteActionSpace):
self._build_discrete_net(input_layer, self.spaces.action)
elif isinstance(self.spaces.action, BoxActionSpace):
self._build_continuous_net(input_layer, self.spaces.action)
else:
raise ValueError("only discrete or continuous action spaces are supported for PPO")
self.action_probs_wrt_policy = self.policy_distribution.log_prob(self.actions)
self.action_probs_wrt_old_policy = self.old_policy_distribution.log_prob(self.actions)
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
# Used by regular PPO only
# add kl divergence regularization
self.kl_divergence = tf.reduce_mean(tf.distributions.kl_divergence(self.old_policy_distribution, self.policy_distribution))
if self.use_kl_regularization:
# no clipping => use kl regularization
self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence)
self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \
tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff))
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
# calculate surrogate loss
self.advantages = tf.placeholder(tf.float32, [None], name="advantages")
self.target = self.advantages
# the likelihood ratio exp(log_prob_new - log_prob_old) is always positive and well-defined
self.likelihood_ratio = tf.exp(self.action_probs_wrt_policy - self.action_probs_wrt_old_policy)
if self.clip_likelihood_ratio_using_epsilon is not None:
self.clip_param_rescaler = tf.placeholder(tf.float32, ())
self.input.append(self.clip_param_rescaler)
max_value = 1 + self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
min_value = 1 - self.clip_likelihood_ratio_using_epsilon * self.clip_param_rescaler
self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value)
self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages,
self.clipped_likelihood_ratio * self.advantages)
else:
self.scaled_advantages = self.likelihood_ratio * self.advantages
# the minus sign turns the surrogate objective, which we want to maximize, into a loss to minimize
self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages)
if self.is_local:
# add entropy regularization
if self.beta:
self.entropy = tf.reduce_mean(self.policy_distribution.entropy())
self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization')
tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations)
self.loss = self.surrogate_loss
tf.losses.add_loss(self.loss)
def _build_discrete_net(self, input_layer, action_space):
num_actions = len(action_space.actions)
self.actions = tf.placeholder(tf.int32, [None], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
# Policy Head
self.input = [self.actions, self.old_policy_mean]
policy_values = tf.layers.dense(input_layer, num_actions, name='policy_fc')
self.policy_mean = tf.nn.softmax(policy_values, name="policy")
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.Categorical(probs=self.policy_mean)
self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean)
self.output = self.policy_mean
def _build_continuous_net(self, input_layer, action_space):
num_actions = action_space.shape[0]
self.actions = tf.placeholder(tf.float32, [None, num_actions], name="actions")
self.old_policy_mean = tf.placeholder(tf.float32, [None, num_actions], "old_policy_mean")
self.old_policy_std = tf.placeholder(tf.float32, [None, num_actions], "old_policy_std")
self.input = [self.actions, self.old_policy_mean, self.old_policy_std]
self.policy_mean = tf.layers.dense(input_layer, num_actions, name='policy_mean',
kernel_initializer=normalized_columns_initializer(0.01))
if self.is_local:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32',
collections=[tf.GraphKeys.LOCAL_VARIABLES])
else:
self.policy_logstd = tf.Variable(np.zeros((1, num_actions)), dtype='float32')
self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std')
# define the distributions for the policy and the old policy
self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, self.policy_std + eps)
self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, self.old_policy_std + eps)
self.output = [self.policy_mean, self.policy_std]
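The clipped surrogate objective above boils down to a few lines: the likelihood ratio is clipped to [1 - eps, 1 + eps] and the pessimistic (element-wise minimum) of the clipped and unclipped advantage-weighted terms is maximized. A NumPy restatement with illustrative numbers:

import numpy as np

log_prob_new = np.array([-0.2, -1.0, -0.1])
log_prob_old = np.array([-0.5, -0.8, -0.4])
advantages = np.array([1.0, -2.0, 0.5])
clip_eps = 0.2

ratio = np.exp(log_prob_new - log_prob_old)
clipped_ratio = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
surrogate = np.minimum(ratio * advantages, clipped_ratio * advantages)
loss = -np.mean(surrogate)   # minus sign: maximize the surrogate by minimizing the loss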

View File

@@ -0,0 +1,52 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import ActionProbabilities
class PPOVHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='ppo_v_head_params'):
super().__init__(parameterized_class=PPOVHead, activation_function=activation_function, name=name)
class PPOVHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'ppo_v_head'
self.clip_likelihood_ratio_using_epsilon = agent_parameters.algorithm.clip_likelihood_ratio_using_epsilon
self.return_type = ActionProbabilities
def _build_module(self, input_layer):
self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values")
self.input = [self.old_policy_value]
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))
self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return")
value_loss_1 = tf.square(self.output - self.target)
value_loss_2 = tf.square(self.old_policy_value +
tf.clip_by_value(self.output - self.old_policy_value,
-self.clip_likelihood_ratio_using_epsilon,
self.clip_likelihood_ratio_using_epsilon) - self.target)
self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2))
self.loss = self.vf_loss
tf.losses.add_loss(self.loss)
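The value loss above applies the same clipping idea to the critic: the new value prediction may not move further than the clip range away from the old prediction, and the worse (element-wise maximum) of the clipped and unclipped squared errors is used. A NumPy illustration with made-up numbers:

import numpy as np

old_values = np.array([1.0, 0.0, -0.5])
new_values = np.array([1.6, 0.1, -1.5])
returns = np.array([2.0, -0.2, -1.0])
clip_eps = 0.2

clipped_values = old_values + np.clip(new_values - old_values, -clip_eps, clip_eps)
value_loss = np.mean(np.maximum((new_values - returns) ** 2,
                                (clipped_values - returns) ** 2))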

View File

@@ -0,0 +1,50 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition, BoxActionSpace, DiscreteActionSpace
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='q_head_params'):
super().__init__(parameterized_class=QHead, activation_function=activation_function, name=name)
class QHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'q_values_head'
if isinstance(self.spaces.action, BoxActionSpace):
self.num_actions = 1
elif isinstance(self.spaces.action, DiscreteActionSpace):
self.num_actions = len(self.spaces.action.actions)
self.return_type = QActionStateValue
if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard Q Network
self.output = tf.layers.dense(input_layer, self.num_actions, name='output')

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.core_types import QActionStateValue
class QuantileRegressionQHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='quantile_regression_q_head_params'):
super().__init__(parameterized_class=QuantileRegressionQHead, activation_function=activation_function,
name=name)
class QuantileRegressionQHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'quantile_regression_dqn_head'
self.num_actions = len(self.spaces.action.actions)
self.num_atoms = agent_parameters.algorithm.atoms # we use atom / quantile interchangeably
self.huber_loss_interval = agent_parameters.algorithm.huber_loss_interval # k
self.return_type = QActionStateValue
def _build_module(self, input_layer):
self.actions = tf.placeholder(tf.int32, [None, 2], name="actions")
self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints")
self.input = [self.actions, self.quantile_midpoints]
# the output of the head is the N unordered quantile locations {theta_1, ..., theta_N}
quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms))
self.output = quantiles_locations
self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles")
self.target = self.quantiles
# only the quantiles of the taken action are taken into account
quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions)
# reorder the output quantiles and the target quantiles as a preparation step for calculating the loss
# the output quantiles vector and the quantile midpoints are tiled as rows of an NxN matrix (N = num quantiles)
# the target quantiles vector is tiled as columns of an NxN matrix
theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms])
T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1])
tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms])
# Huber loss of T(theta_j) - theta_i
error = T_theta_j - theta_i
abs_error = tf.abs(error)
quadratic = tf.minimum(abs_error, self.huber_loss_interval)
huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2
# Quantile Huber loss
quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss
# Quantile regression loss (the probability for each quantile is 1/num_quantiles)
quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms)
self.loss = quantile_regression_loss
tf.losses.add_loss(self.loss)
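A compact NumPy restatement of the quantile Huber loss computed above, for a single state; the quantile count and values are illustrative:

import numpy as np

num_atoms = 4
k = 1.0                                         # Huber loss interval
theta = np.random.randn(num_atoms)              # predicted quantile locations theta_i
target = np.random.randn(num_atoms)             # target quantile locations T(theta_j)
tau = (np.arange(num_atoms) + 0.5) / num_atoms  # quantile midpoints

# pairwise errors: rows index theta_i, columns index the targets T(theta_j)
error = target[None, :] - theta[:, None]
abs_error = np.abs(error)
quadratic = np.minimum(abs_error, k)
huber = k * (abs_error - quadratic) + 0.5 * quadratic ** 2

# asymmetric quantile weighting of the Huber loss
quantile_huber = np.abs(tau[:, None] - (error < 0)) * huber
loss = quantile_huber.sum() / num_atoms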

View File

@@ -0,0 +1,45 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow as tf
from rl_coach.base_parameters import AgentParameters
from rl_coach.spaces import SpacesDefinition
from rl_coach.architectures.tensorflow_components.heads.head import Head, normalized_columns_initializer, HeadParameters
from rl_coach.core_types import VStateValue
class VHeadParameters(HeadParameters):
def __init__(self, activation_function: str ='relu', name: str='v_head_params'):
super().__init__(parameterized_class=VHead, activation_function=activation_function, name=name)
class VHead(Head):
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str='relu'):
super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
self.name = 'v_values_head'
self.return_type = VStateValue
if agent_parameters.network_wrappers[self.network_name.split('/')[0]].replace_mse_with_huber_loss:
self.loss_type = tf.losses.huber_loss
else:
self.loss_type = tf.losses.mean_squared_error
def _build_module(self, input_layer):
# Standard V Network
self.output = tf.layers.dense(input_layer, 1, name='output',
kernel_initializer=normalized_columns_initializer(1.0))

View File

@@ -0,0 +1,86 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, List
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout, Dense
from rl_coach.core_types import Middleware_FC_Embedding
class FCMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu',
scheme: Union[List, MiddlewareScheme] = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(parameterized_class=FCMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
class FCMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
Dense([64])
],
# dqn
MiddlewareScheme.Medium:
[
Dense([512])
],
MiddlewareScheme.Deep: \
[
Dense([128]),
Dense([128]),
Dense([128])
]
}
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_fc_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_FC_Embedding
self.layers = []
def _build_module(self):
self.layers.append(self.input)
if isinstance(self.scheme, MiddlewareScheme):
layers_params = FCMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
layer_params(self.layers[-1], name='{}_{}'.format(layer_params.__class__.__name__, idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
self.output = self.layers[-1]

View File

@@ -0,0 +1,113 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
from rl_coach.architectures.tensorflow_components.middlewares.middleware import Middleware, MiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.architectures.tensorflow_components.architecture import batchnorm_activation_dropout
from rl_coach.core_types import Middleware_LSTM_Embedding
class LSTMMiddlewareParameters(MiddlewareParameters):
def __init__(self, activation_function='relu', number_of_lstm_cells=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(parameterized_class=LSTMMiddleware, activation_function=activation_function,
scheme=scheme, batchnorm=batchnorm, dropout=dropout, name=name)
self.number_of_lstm_cells = number_of_lstm_cells
class LSTMMiddleware(Middleware):
schemes = {
MiddlewareScheme.Empty:
[],
# ppo
MiddlewareScheme.Shallow:
[
[64]
],
# dqn
MiddlewareScheme.Medium:
[
[512]
],
MiddlewareScheme.Deep: \
[
[128],
[128],
[128]
]
}
def __init__(self, activation_function=tf.nn.relu, number_of_lstm_cells: int=256,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False,
name="middleware_lstm_embedder"):
super().__init__(activation_function=activation_function, batchnorm=batchnorm,
dropout=dropout, scheme=scheme, name=name)
self.return_type = Middleware_LSTM_Embedding
self.number_of_lstm_cells = number_of_lstm_cells
self.layers = []
def _build_module(self):
"""
self.state_in: tuple of placeholders containing the initial state
self.state_out: tuple of output state
TODO: the output state appears to have shape (batch, features), and the code below keeps only the
first element of the batch, which looks wrong; the shapes need to be double checked
"""
self.layers.append(self.input)
# optionally insert some dense layers before the LSTM
if isinstance(self.scheme, MiddlewareScheme):
layers_params = LSTMMiddleware.schemes[self.scheme]
else:
layers_params = self.scheme
for idx, layer_params in enumerate(layers_params):
self.layers.append(
tf.layers.dense(self.layers[-1], layer_params[0], name='fc{}'.format(idx))
)
self.layers.extend(batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
self.activation_function, self.dropout,
self.dropout_rate, idx))
# add the LSTM layer
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
self.state_init = [self.c_init, self.h_init]
self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
self.state_in = (self.c_in, self.h_in)
rnn_in = tf.expand_dims(self.layers[-1], [0])
step_size = tf.shape(self.layers[-1])[:1]
state_in = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(
lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False)
lstm_c, lstm_h = lstm_state
self.state_out = (lstm_c[:1, :], lstm_h[:1, :])
self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
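# A hypothetical usage sketch (not part of the original file), assuming a built LSTM middleware and an
# active TF session: the recurrent state is carried across calls by feeding state_out back into state_in.
#   state = middleware.state_init
#   output, state = sess.run([middleware.output, middleware.state_out],
#                            feed_dict={middleware.c_in: state[0], middleware.h_in: state[1],
#                                       **other_network_feeds})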

View File

@@ -0,0 +1,68 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Type, Union, List
import tensorflow as tf
from rl_coach.base_parameters import MiddlewareScheme, Parameters
from rl_coach.core_types import MiddlewareEmbedding
class MiddlewareParameters(Parameters):
def __init__(self, parameterized_class: Type['Middleware'],
activation_function: str='relu', scheme: Union[List, MiddlewareScheme]=MiddlewareScheme.Medium,
batchnorm: bool=False, dropout: bool=False,
name='middleware'):
super().__init__()
self.activation_function = activation_function
self.scheme = scheme
self.batchnorm = batchnorm
self.dropout = dropout
self.name = name
self.parameterized_class_name = parameterized_class.__name__
class Middleware(object):
"""
A middleware embedder is the middle part of the network. It takes the embeddings from the input embedders,
after they have been aggregated in some way (for example, concatenation), and passes them through a neural
network which can be customized but is shared between the heads of the network
"""
def __init__(self, activation_function=tf.nn.relu,
scheme: MiddlewareScheme = MiddlewareScheme.Medium,
batchnorm: bool = False, dropout: bool = False, name="middleware_embedder"):
self.name = name
self.input = None
self.output = None
self.activation_function = activation_function
self.batchnorm = batchnorm
self.dropout = dropout
self.dropout_rate = 0
self.scheme = scheme
self.return_type = MiddlewareEmbedding
def __call__(self, input_layer):
with tf.variable_scope(self.get_name()):
self.input = input_layer
self._build_module()
return self.input, self.output
def _build_module(self):
pass
def get_name(self):
return self.name
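# A hypothetical usage sketch (not part of the original file): a concrete middleware (for example
# FCMiddleware or LSTMMiddleware from the sibling modules) is applied to the merged embedder output
# inside its own variable scope and returns both its input and output tensors.
#   middleware = FCMiddleware(scheme=MiddlewareScheme.Medium)
#   middleware_input, middleware_output = middleware(merged_embeddings_tensor)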

View File

@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
import tensorflow as tf
class SharedRunningStats(object):
def __init__(self, replicated_device=None, epsilon=1e-2, name="", create_ops=True):
self.sess = None
self.name = name
self.replicated_device = replicated_device
self.epsilon = epsilon
self.ops_were_created = False
if create_ops:
with tf.device(replicated_device):
self.create_ops()
def create_ops(self, shape=[1], clip_values=None):
self.clip_values = clip_values
with tf.variable_scope(self.name):
self._sum = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(0.0),
name="running_sum", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._sum_squared = tf.get_variable(
dtype=tf.float64,
initializer=tf.constant_initializer(self.epsilon),
name="running_sum_squared", trainable=False, shape=shape, validate_shape=False,
collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._count = tf.get_variable(
dtype=tf.float64,
shape=(),
initializer=tf.constant_initializer(self.epsilon),
name="count", trainable=False, collections=[tf.GraphKeys.GLOBAL_VARIABLES])
self._shape = None
self._mean = tf.div(self._sum, self._count, name="mean")
self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean))
/ tf.maximum(self._count-1, 1), self.epsilon), name="stdev")
self.tf_mean = tf.cast(self._mean, 'float32')
self.tf_std = tf.cast(self._std, 'float32')
self.new_sum = tf.placeholder(dtype=tf.float64, name='sum')
self.new_sum_squared = tf.placeholder(dtype=tf.float64, name='var')
self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True)
self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True)
self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True)
self.raw_obs = tf.placeholder(dtype=tf.float64, name='raw_obs')
self.normalized_obs = (self.raw_obs - self._mean) / self._std
if self.clip_values is not None:
self.clipped_obs = tf.clip_by_value(self.normalized_obs, self.clip_values[0], self.clip_values[1])
self.ops_were_created = True
def set_session(self, sess):
self.sess = sess
def push(self, x):
x = x.astype('float64')
self.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count],
feed_dict={
self.new_sum: x.sum(axis=0).ravel(),
self.new_sum_squared: np.square(x).sum(axis=0).ravel(),
self.newcount: np.array(len(x), dtype='float64')
})
if self._shape is None:
self._shape = x.shape
@property
def n(self):
return self.sess.run(self._count)
@property
def mean(self):
return self.sess.run(self._mean)
@property
def var(self):
return self.std ** 2
@property
def std(self):
return self.sess.run(self._std)
@property
def shape(self):
return self._shape
@shape.setter
def shape(self, val):
self._shape = val
self.new_sum.set_shape(val)
self.new_sum_squared.set_shape(val)
self.tf_mean.set_shape(val)
self.tf_std.set_shape(val)
self._sum.set_shape(val)
self._sum_squared.set_shape(val)
def normalize(self, batch):
if self.clip_values is not None:
return self.sess.run(self.clipped_obs, feed_dict={self.raw_obs: batch})
else:
return self.sess.run(self.normalized_obs, feed_dict={self.raw_obs: batch})
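# A minimal NumPy-only sketch (not part of the original file) of the same running-statistics update
# that SharedRunningStats implements with shared TF variables: accumulate a sum, a sum of squares
# and a count, and derive the mean and an epsilon-floored std from them.
class _RunningStatsSketch(object):
    def __init__(self, epsilon=1e-2):
        self._sum = 0.0
        self._sum_squared = epsilon
        self._count = epsilon

    def push(self, x):
        # accumulate per-feature sums over the batch dimension
        x = np.asarray(x, dtype=np.float64)
        self._sum = self._sum + x.sum(axis=0)
        self._sum_squared = self._sum_squared + np.square(x).sum(axis=0)
        self._count += len(x)

    @property
    def mean(self):
        return self._sum / self._count

    @property
    def std(self):
        var = (self._sum_squared - self._count * np.square(self.mean)) / max(self._count - 1, 1)
        return np.sqrt(np.maximum(var, 1e-2))

    def normalize(self, batch):
        return (np.asarray(batch) - self.mean) / self.std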

350
rl_coach/base_parameters.py Normal file
View File

@@ -0,0 +1,350 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import inspect
import json
import os
import sys
import types
from collections import OrderedDict
from enum import Enum
from typing import Dict, List, Union
from rl_coach.core_types import TrainingSteps, EnvironmentSteps, GradientClippingMethod
from rl_coach.filters.filter import NoInputFilter
class Frameworks(Enum):
tensorflow = "TensorFlow"
class EmbedderScheme(Enum):
Empty = "Empty"
Shallow = "Shallow"
Medium = "Medium"
Deep = "Deep"
class MiddlewareScheme(Enum):
Empty = "Empty"
Shallow = "Shallow"
Medium = "Medium"
Deep = "Deep"
class EmbeddingMergerType(Enum):
Concat = 0
Sum = 1
#ConcatDepthWise = 2
#Multiply = 3
def iterable_to_items(obj):
if isinstance(obj, dict) or isinstance(obj, OrderedDict) or isinstance(obj, types.MappingProxyType):
items = obj.items()
elif isinstance(obj, list):
items = enumerate(obj)
else:
raise ValueError("The given object is not a dict or a list")
return items
def unfold_dict_or_list(obj: Union[Dict, List, OrderedDict]):
"""
Recursively unfolds all the parameters in dictionaries and lists
:param obj: a dictionary or list to unfold
:return: the unfolded parameters dictionary
"""
parameters = OrderedDict()
items = iterable_to_items(obj)
for k, v in items:
if isinstance(v, dict) or isinstance(v, list) or isinstance(v, OrderedDict):
if 'tensorflow.' not in str(v.__class__):
parameters[k] = unfold_dict_or_list(v)
elif 'tensorflow.' in str(v.__class__):
parameters[k] = v
elif hasattr(v, '__dict__'):
sub_params = v.__dict__
if '__objclass__' not in sub_params.keys():
try:
parameters[k] = unfold_dict_or_list(sub_params)
except RecursionError:
parameters[k] = sub_params
parameters[k]['__class__'] = v.__class__.__name__
else:
# unfolding this type of object will result in infinite recursion
parameters[k] = sub_params
else:
parameters[k] = v
if not isinstance(obj, OrderedDict) and not isinstance(obj, list):
parameters = OrderedDict(sorted(parameters.items()))
return parameters
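# For example (illustrative): unfold_dict_or_list({'b': {'c': 3}, 'a': [1, 2]}) returns
# OrderedDict([('a', OrderedDict([(0, 1), (1, 2)])), ('b', OrderedDict([('c', 3)]))]).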
class Parameters(object):
def __setattr__(self, key, value):
caller_name = sys._getframe(1).f_code.co_name
if caller_name != '__init__' and not hasattr(self, key):
raise TypeError("Parameter '{}' does not exist in {}. Parameters are only to be defined in a constructor of"
" a class inheriting from Parameters. In order to explicitly register a new parameter "
"outside of a constructor use register_var().".
format(key, self.__class__))
object.__setattr__(self, key, value)
@property
def path(self):
if hasattr(self, 'parameterized_class_name'):
module_path = os.path.relpath(inspect.getfile(self.__class__), os.getcwd())[:-3] + '.py'
return ':'.join([module_path, self.parameterized_class_name])
else:
raise ValueError("The parameters class does not have an attached class it parameterizes. "
"The self.parameterized_class_name should be set to the parameterized class.")
def register_var(self, key, value):
if hasattr(self, key):
raise TypeError("Cannot register an already existing parameter '{}'. ".format(key))
object.__setattr__(self, key, value)
def __str__(self):
result = "\"{}\" {}\n".format(self.__class__.__name__,
json.dumps(unfold_dict_or_list(self.__dict__), indent=4, default=repr))
return result
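# A hypothetical usage sketch (not part of the original file): attributes of a Parameters subclass
# may only be created inside __init__; any later addition must go through register_var().
#   class MySketchParameters(Parameters):
#       def __init__(self):
#           super().__init__()
#           self.learning_rate = 0.001
#   params = MySketchParameters()
#   params.learning_rate = 0.01              # fine, defined in the constructor
#   params.register_var('extra_flag', True)  # explicit registration of a new parameter
#   params.new_field = 1                     # would raise a TypeError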
class AlgorithmParameters(Parameters):
def __init__(self):
# Architecture parameters
self.use_accumulated_reward_as_measurement = False
# Agent parameters
self.num_consecutive_playing_steps = EnvironmentSteps(1)
self.num_consecutive_training_steps = 1 # TODO: update this to TrainingSteps
self.heatup_using_network_decisions = False
self.discount = 0.99
self.apply_gradients_every_x_episodes = 5
self.num_steps_between_copying_online_weights_to_target = TrainingSteps(0)
self.rate_for_copying_weights_to_target = 1.0
self.load_memory_from_file_path = None
self.collect_new_data = True
# HRL / HER related params
self.in_action_space = None
# distributed agents params
self.share_statistics_between_workers = True
# intrinsic reward
self.scale_external_reward_by_intrinsic_reward_value = False
class PresetValidationParameters(Parameters):
def __init__(self):
super().__init__()
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
# the form of different workers starting at different times, and getting different assignments of CPU
# time from the OS.
# Testing parameters
self.test = False
self.min_reward_threshold = 0
self.max_episodes_to_achieve_reward = 1
self.num_workers = 1
self.reward_test_level = None
self.trace_test_levels = None
self.trace_max_env_steps = 5000
class NetworkParameters(Parameters):
def __init__(self):
super().__init__()
self.framework = Frameworks.tensorflow
self.sess = None
# hardware parameters
self.force_cpu = False
# distributed training options
self.num_threads = 1
self.synchronize_over_num_threads = 1
self.distributed = False
self.async_training = False
self.shared_optimizer = True
self.scale_down_gradients_by_number_of_workers_for_sync_training = True
# regularization
self.clip_gradients = None
self.gradients_clipping_method = GradientClippingMethod.ClipByGlobalNorm
self.kl_divergence_constraint = None
self.l2_regularization = 0
# learning rate
self.learning_rate = 0.00025
self.learning_rate_decay_rate = 0
self.learning_rate_decay_steps = 0
# structure
self.input_embedders_parameters = []
self.embedding_merger_type = EmbeddingMergerType.Concat
self.middleware_parameters = None
self.heads_parameters = []
self.num_output_head_copies = 1
self.loss_weights = []
self.rescale_gradient_from_head_by_factor = [1]
self.use_separate_networks_per_head = False
self.optimizer_type = 'Adam'
self.optimizer_epsilon = 0.0001
self.adam_optimizer_beta1 = 0.9
self.adam_optimizer_beta2 = 0.99
self.rms_prop_optimizer_decay = 0.9
self.batch_size = 32
self.replace_mse_with_huber_loss = False
self.create_target_network = False
# Framework support
self.tensorflow_support = True
class InputEmbedderParameters(Parameters):
def __init__(self, activation_function: str='relu', scheme: Union[List, EmbedderScheme]=EmbedderScheme.Medium,
batchnorm: bool=False, dropout=False, name: str='embedder', input_rescaling=None, input_offset=None,
input_clipping=None):
super().__init__()
self.activation_function = activation_function
self.scheme = scheme
self.batchnorm = batchnorm
self.dropout = dropout
if input_rescaling is None:
input_rescaling = {'image': 255.0, 'vector': 1.0}
if input_offset is None:
input_offset = {'image': 0.0, 'vector': 0.0}
self.input_rescaling = input_rescaling
self.input_offset = input_offset
self.input_clipping = input_clipping
self.name = name
@property
def path(self):
return {
"image": 'image_embedder:ImageEmbedder',
"vector": 'vector_embedder:VectorEmbedder'
}
class VisualizationParameters(Parameters):
def __init__(self):
super().__init__()
# Visualization parameters
self.print_summary = True
self.dump_csv = True
self.dump_gifs = False
self.dump_mp4 = False
self.dump_signals_to_csv_every_x_episodes = 5
self.dump_in_episode_signals = False
self.dump_parameters_documentation = True
self.render = False
self.native_rendering = False
self.max_fps_for_human_control = 10
self.tensorboard = False
self.video_dump_methods = [] # a list of dump methods which will be checked one after the other until the first
# dump method that returns false for should_dump()
self.add_rendered_image_to_env_response = False
class AgentParameters(Parameters):
def __init__(self, algorithm: AlgorithmParameters, exploration: 'ExplorationParameters', memory: 'MemoryParameters',
networks: Dict[str, NetworkParameters], visualization: VisualizationParameters=VisualizationParameters()):
"""
:param algorithm: the algorithmic parameters
:param exploration: the exploration policy parameters
:param memory: the memory module parameters
:param networks: the parameters for the networks of the agent
:param visualization: the visualization parameters
"""
super().__init__()
self.visualization = visualization
self.algorithm = algorithm
self.exploration = exploration
self.memory = memory
self.network_wrappers = networks
self.input_filter = None
self.output_filter = None
self.pre_network_filter = NoInputFilter()
self.full_name_id = None # TODO: do we really want to hold this parameters here?
self.name = None
self.is_a_highest_level_agent = True
self.is_a_lowest_level_agent = True
self.task_parameters = None
@property
def path(self):
return 'rl_coach.agents.agent:Agent'
class TaskParameters(Parameters):
def __init__(self, framework_type: str, evaluate_only: bool=False, use_cpu: bool=False, experiment_path=None,
seed=None):
"""
:param framework_type: deep learning framework type. currently only tensorflow is supported
:param evaluate_only: the task will be used only for evaluating the model
:param use_cpu: use the cpu for this task
:param experiment_path: the path to the directory which will store all the experiment outputs
:param seed: a seed to use for the random numbers generator
"""
self.framework_type = framework_type
self.task_index = None # TODO: not really needed
self.evaluate_only = evaluate_only
self.use_cpu = use_cpu
self.experiment_path = experiment_path
self.seed = seed
class DistributedTaskParameters(TaskParameters):
def __init__(self, framework_type: str, parameters_server_hosts: str, worker_hosts: str, job_type: str,
task_index: int, evaluate_only: bool=False, num_tasks: int=None,
num_training_tasks: int=None, use_cpu: bool=False, experiment_path=None, dnd=None,
shared_memory_scratchpad=None, seed=None):
"""
:param framework_type: deep learning framework type. currently only tensorflow is supported
:param evaluate_only: the task will be used only for evaluating the model
:param parameters_server_hosts: comma-separated list of hostname:port pairs to which the parameter servers are
assigned
:param worker_hosts: comma-separated list of hostname:port pairs to which the workers are assigned
:param job_type: the job type - either ps (short for parameters server) or worker
:param task_index: the index of the process
:param num_tasks: the number of total tasks that are running (not including the parameters server)
:param num_training_tasks: the number of tasks that are training (not including the parameters server)
:param use_cpu: use the cpu for this task
:param experiment_path: the path to the directory which will store all the experiment outputs
:param dnd: an external DND to use for NEC. This is a workaround needed for a shared DND not using the scratchpad.
:param seed: a seed to use for the random numbers generator
"""
super().__init__(framework_type=framework_type, evaluate_only=evaluate_only, use_cpu=use_cpu,
experiment_path=experiment_path, seed=seed)
self.parameters_server_hosts = parameters_server_hosts
self.worker_hosts = worker_hosts
self.job_type = job_type
self.task_index = task_index
self.num_tasks = num_tasks
self.num_training_tasks = num_training_tasks
self.device = None # the replicated device which will be used for the global parameters
self.worker_target = None
self.dnd = dnd
self.shared_memory_scratchpad = shared_memory_scratchpad

402
rl_coach/coach.py Normal file
View File

@@ -0,0 +1,402 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
sys.path.append('.')
import copy
from rl_coach.core_types import EnvironmentSteps
import os
from rl_coach import logger
import traceback
from rl_coach.logger import screen, failed_imports
import argparse
import atexit
import time
import sys
from rl_coach.base_parameters import Frameworks, VisualizationParameters, TaskParameters, DistributedTaskParameters
from multiprocessing import Process
from multiprocessing.managers import BaseManager
import subprocess
from rl_coach.graph_managers.graph_manager import HumanPlayScheduleParameters, GraphManager
from rl_coach.utils import list_all_presets, short_dynamic_import, get_open_port, SharedMemoryScratchPad, get_base_dir
from rl_coach.agents.human_agent import HumanAgentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.environments.environment import SingleLevelSelection
if len(set(failed_imports)) > 0:
screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
def get_graph_manager_from_args(args: argparse.Namespace) -> 'GraphManager':
"""
Return the graph manager according to the command line arguments given by the user
:param args: the arguments given by the user
:return: the updated graph manager
"""
graph_manager = None
# if a preset was given we will load the graph manager for the preset
if args.preset is not None:
graph_manager = short_dynamic_import(args.preset, ignore_module_case=True)
# for human play we need to create a custom graph manager
if args.play:
env_params = short_dynamic_import(args.environment_type, ignore_module_case=True)()
env_params.human_control = True
schedule_params = HumanPlayScheduleParameters()
graph_manager = BasicRLGraphManager(HumanAgentParameters(), env_params, schedule_params, VisualizationParameters())
if args.level:
if isinstance(graph_manager.env_params.level, SingleLevelSelection):
graph_manager.env_params.level.select(args.level)
else:
graph_manager.env_params.level = args.level
# set the seed for the environment
if args.seed is not None:
graph_manager.env_params.seed = args.seed
# visualization
graph_manager.visualization_parameters.dump_gifs = graph_manager.visualization_parameters.dump_gifs or args.dump_gifs
graph_manager.visualization_parameters.dump_mp4 = graph_manager.visualization_parameters.dump_mp4 or args.dump_mp4
graph_manager.visualization_parameters.render = args.render
graph_manager.visualization_parameters.tensorboard = args.tensorboard
# update the custom parameters
if args.custom_parameter is not None:
unstripped_key_value_pairs = [pair.split('=') for pair in args.custom_parameter.split(';')]
stripped_key_value_pairs = [tuple([pair[0].strip(), pair[1].strip()]) for pair in
unstripped_key_value_pairs if len(pair) == 2]
# load custom parameters into run_dict
for key, value in stripped_key_value_pairs:
exec("graph_manager.{}={}".format(key, value))
return graph_manager
def parse_arguments(parser: argparse.ArgumentParser) -> argparse.Namespace:
"""
Parse the arguments that the user entered
:param parser: the argparse command line parser
:return: the parsed arguments
"""
args = parser.parse_args()
# if no arg is given
if len(sys.argv) == 1:
parser.print_help()
exit(0)
# list available presets
preset_names = list_all_presets()
if args.list:
screen.log_title("Available Presets:")
for preset in sorted(preset_names):
print(preset)
sys.exit(0)
# replace a short preset name with the full path
if args.preset is not None:
if args.preset.lower() in [p.lower() for p in preset_names]:
args.preset = "{}.py:graph_manager".format(os.path.join(get_base_dir(), 'presets', args.preset))
else:
args.preset = "{}".format(args.preset)
# verify that the preset exists
preset_path = args.preset.split(":")[0]
if not os.path.exists(preset_path):
screen.error("The given preset ({}) cannot be found.".format(args.preset))
# verify that the preset can be instantiated
try:
short_dynamic_import(args.preset, ignore_module_case=True)
except TypeError as e:
traceback.print_exc()
screen.error('Internal Error: ' + str(e) + "\n\nThe given preset ({}) cannot be instantiated."
.format(args.preset))
# validate the checkpoints args
if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
screen.error("The requested checkpoint folder to load from does not exist.")
# no preset was given. check if the user requested to play some environment on its own
if args.preset is None and args.play:
if args.environment_type:
args.agent_type = 'Human'
else:
screen.error('When no preset is given for Coach to run, and the user requests human control over '
'the environment, the user is expected to input the desired environment_type and level.'
'\nAt least one of these parameters was not given.')
elif args.preset and args.play:
screen.error("Both the --preset and the --play flags were set. These flags can not be used together. "
"For human control, please use the --play flag together with the environment type flag (-et)")
elif args.preset is None and not args.play:
screen.error("Please choose a preset using the -p flag or use the --play flag together with choosing an "
"environment type (-et) in order to play the game.")
# get experiment name and path
args.experiment_name = logger.get_experiment_name(args.experiment_name)
args.experiment_path = logger.get_experiment_path(args.experiment_name)
if args.play and args.num_workers > 1:
screen.warning("Playing the game as a human is only available with a single worker. "
"The number of workers will be reduced to 1")
args.num_workers = 1
args.framework = Frameworks[args.framework.lower()]
# checkpoints
args.save_checkpoint_dir = os.path.join(args.experiment_path, 'checkpoint') if args.save_checkpoint_secs is not None else None
return args
def add_items_to_dict(target_dict, source_dict):
updated_task_parameters = copy.copy(source_dict)
updated_task_parameters.update(target_dict)
return updated_task_parameters
def open_dashboard(experiment_path):
dashboard_path = 'python {}/dashboard.py'.format(get_base_dir())
cmd = "{} --experiment_dir {}".format(dashboard_path, experiment_path)
screen.log_title("Opening dashboard - experiment path: {}".format(experiment_path))
# subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True, executable="/bin/bash")
subprocess.Popen(cmd, shell=True, executable="/bin/bash")
def start_graph(graph_manager: 'GraphManager', task_parameters: 'TaskParameters'):
graph_manager.create_graph(task_parameters)
# let the adventure begin
if task_parameters.evaluate_only:
graph_manager.evaluate(EnvironmentSteps(sys.maxsize), keep_networks_in_sync=True)
else:
graph_manager.improve()
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--preset',
help="(string) Name of a preset to run (class name from the 'presets' directory.)",
default=None,
type=str)
parser.add_argument('-l', '--list',
help="(flag) List all available presets",
action='store_true')
parser.add_argument('-e', '--experiment_name',
help="(string) Experiment name to be used to store the results.",
default='',
type=str)
parser.add_argument('-r', '--render',
help="(flag) Render environment",
action='store_true')
parser.add_argument('-f', '--framework',
help="(string) Neural network framework. Available values: tensorflow",
default='tensorflow',
type=str)
parser.add_argument('-n', '--num_workers',
help="(int) Number of workers for multi-process based agents, e.g. A3C",
default=1,
type=int)
parser.add_argument('-c', '--use_cpu',
help="(flag) Use only the cpu for training. If a GPU is not available, this flag will have no "
"effect and the CPU will be used either way.",
action='store_true')
parser.add_argument('-ew', '--evaluation_worker',
help="(int) If multiple workers are used, add an evaluation worker as well which will "
"evaluate asynchronously and independently during the training. NOTE: this worker will "
"ignore the evaluation settings in the preset's ScheduleParams.",
action='store_true')
parser.add_argument('--play',
help="(flag) Play as a human by controlling the game with the keyboard. "
"This option will save a replay buffer with the game play.",
action='store_true')
parser.add_argument('--evaluate',
help="(flag) Run evaluation only. This is a convenient way to disable "
"training in order to evaluate an existing checkpoint.",
action='store_true')
parser.add_argument('-v', '--verbosity',
help="(flag) Sets the verbosity level of Coach print outs. Can be either low or high.",
default="low",
type=str)
parser.add_argument('-tfv', '--tf_verbosity',
help="(flag) TensorFlow verbosity level",
default=3,
type=int)
parser.add_argument('-s', '--save_checkpoint_secs',
help="(int) Time in seconds between saving checkpoints of the model.",
default=None,
type=int)
parser.add_argument('-crd', '--checkpoint_restore_dir',
help='(string) Path to a folder containing a checkpoint to restore the model from.',
type=str)
parser.add_argument('-dg', '--dump_gifs',
help="(flag) Enable the gif saving functionality.",
action='store_true')
parser.add_argument('-dm', '--dump_mp4',
help="(flag) Enable the mp4 saving functionality.",
action='store_true')
parser.add_argument('-at', '--agent_type',
help="(string) Choose an agent type class to override on top of the selected preset. "
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type",
default=None,
type=str)
parser.add_argument('-et', '--environment_type',
help="(string) Choose an environment type class to override on top of the selected preset."
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type",
default=None,
type=str)
parser.add_argument('-ept', '--exploration_policy_type',
help="(string) Choose an exploration policy type class to override on top of the selected "
"preset."
"If no preset is defined, a preset can be set from the command-line by combining settings "
"which are set by using --agent_type, --experiment_type, --environemnt_type"
,
default=None,
type=str)
parser.add_argument('-lvl', '--level',
help="(string) Choose the level that will be played in the environment that was selected."
"This value will override the level parameter in the environment class."
,
default=None,
type=str)
parser.add_argument('-cp', '--custom_parameter',
help="(string) Semicolon separated parameters used to override specific parameters on top of"
" the selected preset (or on top of the command-line assembled one). "
"Whenever a parameter value is a string, it should be inputted as '\\\"string\\\"'. "
"For ex.: "
"\"visualization.render=False; num_training_iterations=500; optimizer='rmsprop'\"",
default=None,
type=str)
parser.add_argument('--print_parameters',
help="(flag) Print tuning_parameters to stdout",
action='store_true')
parser.add_argument('-tb', '--tensorboard',
help="(flag) When using the TensorFlow backend, enable TensorBoard log dumps. ",
action='store_true')
parser.add_argument('-ns', '--no_summary',
help="(flag) Prevent Coach from printing a summary and asking questions at the end of runs",
action='store_true')
parser.add_argument('-d', '--open_dashboard',
help="(flag) Open dashboard with the experiment when the run starts",
action='store_true')
parser.add_argument('--seed',
help="(int) A seed to use for running the experiment",
default=None,
type=int)
args = parse_arguments(parser)
graph_manager = get_graph_manager_from_args(args)
# Intel optimized TF seems to run significantly faster when limiting to a single OMP thread.
# This will not affect GPU runs.
os.environ["OMP_NUM_THREADS"] = "1"
# turn TF debug prints off
if args.framework == Frameworks.tensorflow:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(args.tf_verbosity)
# turn off the summary at the end of the run if necessary
if not args.no_summary:
atexit.register(logger.summarize_experiment)
screen.change_terminal_title(args.experiment_name)
# open dashboard
if args.open_dashboard:
open_dashboard(args.experiment_path)
# Single-threaded runs
if args.num_workers == 1:
# Start the training or evaluation
task_parameters = TaskParameters(framework_type="tensorflow", # TODO: tensorflow should'nt be hardcoded
evaluate_only=args.evaluate,
experiment_path=args.experiment_path,
seed=args.seed,
use_cpu=args.use_cpu)
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
start_graph(graph_manager=graph_manager, task_parameters=task_parameters)
# Multi-threaded runs
else:
total_tasks = args.num_workers
if args.evaluation_worker:
total_tasks += 1
ps_hosts = "localhost:{}".format(get_open_port())
worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(total_tasks)])
# Shared memory
class CommManager(BaseManager):
pass
CommManager.register('SharedMemoryScratchPad', SharedMemoryScratchPad, exposed=['add', 'get', 'internal_call'])
comm_manager = CommManager()
comm_manager.start()
shared_memory_scratchpad = comm_manager.SharedMemoryScratchPad()
def start_distributed_task(job_type, task_index, evaluation_worker=False,
shared_memory_scratchpad=shared_memory_scratchpad):
task_parameters = DistributedTaskParameters(framework_type="tensorflow", # TODO: tensorflow should'nt be hardcoded
parameters_server_hosts=ps_hosts,
worker_hosts=worker_hosts,
job_type=job_type,
task_index=task_index,
evaluate_only=evaluation_worker,
use_cpu=args.use_cpu,
num_tasks=total_tasks, # training tasks + 1 evaluation task
num_training_tasks=args.num_workers,
experiment_path=args.experiment_path,
shared_memory_scratchpad=shared_memory_scratchpad,
seed=args.seed+task_index if args.seed is not None else None) # each worker gets a different seed
task_parameters.__dict__ = add_items_to_dict(task_parameters.__dict__, args.__dict__)
# we assume that only the evaluation workers are rendering
graph_manager.visualization_parameters.render = args.render and evaluation_worker
p = Process(target=start_graph, args=(graph_manager, task_parameters))
# p.daemon = True
p.start()
return p
# parameter server
parameter_server = start_distributed_task("ps", 0)
# training workers
# wait a bit before spawning the non chief workers in order to make sure the session is already created
workers = []
workers.append(start_distributed_task("worker", 0))
time.sleep(2)
for task_index in range(1, args.num_workers):
workers.append(start_distributed_task("worker", task_index))
# evaluation worker
if args.evaluation_worker:
evaluation_worker = start_distributed_task("worker", args.num_workers, evaluation_worker=True)
# wait for all workers
[w.join() for w in workers]
if args.evaluation_worker:
evaluation_worker.terminate()
if __name__ == "__main__":
main()
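# Illustrative command lines (preset names depend on the presets shipped with this release):
#   python3 rl_coach/coach.py -p CartPole_DQN -r
#   python3 rl_coach/coach.py -p Atari_DQN -lvl breakout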

687
rl_coach/core_types.py Normal file
View File

@@ -0,0 +1,687 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from enum import Enum
from typing import List, Union, Dict, Any, Type
from random import shuffle
import numpy as np
import copy
ActionType = Union[int, float, np.ndarray, List]
GoalType = Union[None, np.ndarray]
ObservationType = np.ndarray
RewardType = Union[int, float, np.ndarray]
StateType = Dict[str, np.ndarray]
class GoalTypes(Enum):
Embedding = 1
EmbeddingChange = 2
Observation = 3
Measurements = 4
# step methods
class StepMethod(object):
def __init__(self, num_steps: int):
self._num_steps = self.num_steps = num_steps
@property
def num_steps(self) -> int:
return self._num_steps
@num_steps.setter
def num_steps(self, val: int) -> None:
self._num_steps = val
class Frames(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class EnvironmentSteps(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class EnvironmentEpisodes(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class TrainingSteps(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class Time(StepMethod):
def __init__(self, num_steps):
super().__init__(num_steps)
class PredictionType(object):
pass
class VStateValue(PredictionType):
pass
class QActionStateValue(PredictionType):
pass
class ActionProbabilities(PredictionType):
pass
class Embedding(PredictionType):
pass
class InputEmbedding(Embedding):
pass
class MiddlewareEmbedding(Embedding):
pass
class InputImageEmbedding(InputEmbedding):
pass
class InputVectorEmbedding(InputEmbedding):
pass
class Middleware_FC_Embedding(MiddlewareEmbedding):
pass
class Middleware_LSTM_Embedding(MiddlewareEmbedding):
pass
class Measurements(PredictionType):
pass
PlayingStepsType = Union[EnvironmentSteps, EnvironmentEpisodes, Frames]
# run phases
class RunPhase(Enum):
HEATUP = "Heatup"
TRAIN = "Training"
TEST = "Testing"
UNDEFINED = "Undefined"
# transitions
class Transition(object):
def __init__(self, state: Dict[str, np.ndarray]=None, action: ActionType=None, reward: RewardType=None,
next_state: Dict[str, np.ndarray]=None, game_over: bool=None, info: Dict=None):
"""
A transition is a tuple containing the information of a single step of interaction
between the agent and the environment. The most basic version should contain the following values:
(current state, action, reward, next state, game over)
For imitation learning algorithms, if the reward, next state or game over is not known,
it is sufficient to store the current state and action taken by the expert.
:param state: The current state. Assumed to be a dictionary where the observation
is located at state['observation']
:param action: The current action that was taken
:param reward: The reward received from the environment
:param next_state: The next state of the environment after applying the action.
The next state should be similar to the state in its structure.
:param game_over: A boolean which should be True if the episode terminated after
the execution of the action.
:param info: A dictionary containing any additional information to be stored in the transition
"""
self._state = self.state = state
self._action = self.action = action
self._reward = self.reward = reward
self._total_return = self.total_return = None
if not next_state:
next_state = state
self._next_state = self.next_state = next_state
self._game_over = self.game_over = game_over
if info is None:
self.info = {}
else:
self.info = info
def __repr__(self):
return str(self.__dict__)
@property
def state(self):
if self._state is None:
raise Exception("The state was not filled by any of the modules between the environment and the agent")
return self._state
@state.setter
def state(self, val):
self._state = val
@property
def action(self):
if self._action is None:
raise Exception("The action was not filled by any of the modules between the environment and the agent")
return self._action
@action.setter
def action(self, val):
self._action = val
@property
def reward(self):
if self._reward is None:
raise Exception("The reward was not filled by any of the modules between the environment and the agent")
return self._reward
@reward.setter
def reward(self, val):
self._reward = val
@property
def total_return(self):
if self._total_return is None:
raise Exception("The total_return was not filled by any of the modules between the environment and the "
"agent. Make sure that you are using an episodic experience replay.")
return self._total_return
@total_return.setter
def total_return(self, val):
self._total_return = val
@property
def game_over(self):
if self._game_over is None:
raise Exception("The done flag was not filled by any of the modules between the environment and the agent")
return self._game_over
@game_over.setter
def game_over(self, val):
self._game_over = val
@property
def next_state(self):
if self._next_state is None:
raise Exception("The next state was not filled by any of the modules between the environment and the agent")
return self._next_state
@next_state.setter
def next_state(self, val):
self._next_state = val
def add_info(self, new_info: Dict[str, Any]) -> None:
if not new_info.keys().isdisjoint(self.info.keys()):
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there "
"are overlapping keys between the two. old keys: {}, new keys: {}"
.format(self.info.keys(), new_info.keys()))
self.info.update(new_info)
def __copy__(self):
new_transition = type(self)()
new_transition.__dict__.update(self.__dict__)
new_transition.state = copy.copy(new_transition.state)
new_transition.next_state = copy.copy(new_transition.next_state)
new_transition.info = copy.copy(new_transition.info)
return new_transition
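# A hypothetical construction sketch (not part of the original file):
#   t = Transition(state={'observation': np.zeros(4)}, action=1, reward=0.5,
#                  next_state={'observation': np.ones(4)}, game_over=False)
#   t.add_info({'action_probability': 0.25})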
class EnvResponse(object):
def __init__(self, next_state: Dict[str, ObservationType], reward: RewardType, game_over: bool, info: Dict=None,
goal: ObservationType=None):
"""
An env response is a collection containing the information returned from the environment after a single action
has been performed on it.
:param next_state: The new state that the environment has transitioned into. Assumed to be a dictionary where the
observation is located at state['observation']
:param reward: The reward received from the environment
:param game_over: A boolean which should be True if the episode terminated after
the execution of the action.
:param info: any additional info from the environment
:param goal: a goal defined by the environment
"""
self._next_state = self.next_state = next_state
self._reward = self.reward = reward
self._game_over = self.game_over = game_over
self._goal = self.goal = goal
if info is None:
self.info = {}
else:
self.info = info
def __repr__(self):
return str(self.__dict__)
@property
def next_state(self):
return self._next_state
@next_state.setter
def next_state(self, val):
self._next_state = val
@property
def reward(self):
return self._reward
@reward.setter
def reward(self, val):
self._reward = val
@property
def game_over(self):
return self._game_over
@game_over.setter
def game_over(self, val):
self._game_over = val
@property
def goal(self):
return self._goal
@goal.setter
def goal(self, val):
self._goal = val
def add_info(self, info: Dict[str, Any]) -> None:
if not info.keys().isdisjoint(self.info.keys()):
raise ValueError("The new info dictionary can not be appended to the existing info dictionary since there "
"are overlapping keys between the two")
self.info.update(info)
class ActionInfo(object):
"""
Action info is a class that holds an action and various additional information details about it
"""
def __init__(self, action: ActionType, action_probability: float=0,
action_value: float=0., state_value: float=0., max_action_value: float=None,
action_intrinsic_reward: float=0):
"""
:param action: the action
:param action_probability: the probability that the action was given when selecting it
:param action_value: the state-action value (Q value) of the action
:param state_value: the state value (V value) of the state where the action was taken
:param max_action_value: in case this is an action that was selected randomly, this is the value of the action
that received the maximum value. if no value is given, the action is assumed to be the
action with the maximum value
:param action_intrinsic_reward: can contain any intrinsic reward that the agent wants to add to this action
selection
"""
self.action = action
self.action_probability = action_probability
self.action_value = action_value
self.state_value = state_value
if not max_action_value:
self.max_action_value = action_value
else:
self.max_action_value = max_action_value
self.action_intrinsic_reward = action_intrinsic_reward
class Batch(object):
def __init__(self, transitions: List[Transition]):
"""
A wrapper around a list of transitions that helps extract batches of parameters from it.
For example, one can extract a list of states corresponding to the list of transitions.
The class uses lazy evaluation in order to return each of the available parameters.
:param transitions: a list of transitions to extract the batch from
"""
self.transitions = transitions
self._states = {}
self._actions = None
self._rewards = None
self._total_returns = None
self._game_overs = None
self._next_states = {}
self._goals = None
self._info = {}
def slice(self, start, end) -> None:
"""
Keep a slice from the batch and discard the rest of the batch
:param start: the start index in the slice
:param end: the end index in the slice
:return: None
"""
self.transitions = self.transitions[start:end]
for k, v in self._states.items():
self._states[k] = v[start:end]
if self._actions is not None:
self._actions = self._actions[start:end]
if self._rewards is not None:
self._rewards = self._rewards[start:end]
if self._total_returns is not None:
self._total_returns = self._total_returns[start:end]
if self._game_overs is not None:
self._game_overs = self._game_overs[start:end]
for k, v in self._next_states.items():
self._next_states[k] = v[start:end]
if self._goals is not None:
self._goals = self._goals[start:end]
for k, v in self._info.items():
self._info[k] = v[start:end]
def shuffle(self) -> None:
"""
Shuffle all the transitions in the batch
:return: None
"""
batch_order = list(range(self.size))
shuffle(batch_order)
self.transitions = [self.transitions[i] for i in batch_order]
self._states = {}
self._actions = None
self._rewards = None
self._total_returns = None
self._game_overs = None
self._next_states = {}
self._goals = None
self._info = {}
# This seems to be slower
# for k, v in self._states.items():
# self._states[k] = [v[i] for i in batch_order]
# if self._actions is not None:
# self._actions = [self._actions[i] for i in batch_order]
# if self._rewards is not None:
# self._rewards = [self._rewards[i] for i in batch_order]
# if self._total_returns is not None:
# self._total_returns = [self._total_returns[i] for i in batch_order]
# if self._game_overs is not None:
# self._game_overs = [self._game_overs[i] for i in batch_order]
# for k, v in self._next_states.items():
# self._next_states[k] = [v[i] for i in batch_order]
# if self._goals is not None:
# self._goals = [self._goals[i] for i in batch_order]
# for k, v in self._info.items():
# self._info[k] = [v[i] for i in batch_order]
def states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
"""
follow the keys in fetches to extract the corresponding items from the states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
"""
current_states = {}
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
for key in set(fetches).intersection(self.transitions[0].state.keys()):
if key not in self._states.keys():
self._states[key] = np.array([np.array(transition.state[key]) for transition in self.transitions])
if expand_dims:
current_states[key] = np.expand_dims(self._states[key], -1)
else:
current_states[key] = self._states[key]
return current_states
def actions(self, expand_dims=False) -> np.ndarray:
"""
if the actions were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the actions batch
:return: a numpy array containing all the actions of the batch
"""
if self._actions is None:
self._actions = np.array([transition.action for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._actions, -1)
return self._actions
def rewards(self, expand_dims=False) -> np.ndarray:
"""
if the rewards were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the rewards batch
:return: a numpy array containing all the rewards of the batch
"""
if self._rewards is None:
self._rewards = np.array([transition.reward for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._rewards, -1)
return self._rewards
def total_returns(self, expand_dims=False) -> np.ndarray:
"""
if the total_returns were not converted to a batch before, extract them to a batch and then return the batch
if the total return was not filled, this will raise an exception
:param expand_dims: add an extra dimension to the total_returns batch
:return: a numpy array containing all the total return values of the batch
"""
if self._total_returns is None:
self._total_returns = np.array([transition.total_return for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._total_returns, -1)
return self._total_returns
def game_overs(self, expand_dims=False) -> np.ndarray:
"""
if the game_overs were not converted to a batch before, extract them to a batch and then return the batch
:param expand_dims: add an extra dimension to the game_overs batch
:return: a numpy array containing all the game over flags of the batch
"""
if self._game_overs is None:
self._game_overs = np.array([transition.game_over for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._game_overs, -1)
return self._game_overs
def next_states(self, fetches: List[str], expand_dims=False) -> Dict[str, np.ndarray]:
"""
follow the keys in fetches to extract the corresponding items from the next states in the batch
if these keys were not already extracted before. return only the values corresponding to those keys
:param fetches: the keys of the state dictionary to extract
:param expand_dims: add an extra dimension to each of the value batches
:return: a dictionary containing a batch of values corresponding to each of the given fetches keys
"""
next_states = {}
# there are cases (e.g. ddpg) where the state does not contain all the information needed for running
# through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
# addition to the current_state, so that all the inputs of the network will be filled)
for key in set(fetches).intersection(self.transitions[0].next_state.keys()):
if key not in self._next_states.keys():
self._next_states[key] = np.array([np.array(transition.next_state[key]) for transition in self.transitions])
if expand_dims:
next_states[key] = np.expand_dims(self._next_states[key], -1)
else:
next_states[key] = self._next_states[key]
return next_states
def goals(self, expand_dims=False) -> np.ndarray:
"""
if the goals were not converted to a batch before, extract them to a batch and then return the batch
if the goal was not filled, this will raise an exception
:param expand_dims: add an extra dimension to the goals batch
:return: a numpy array containing all the goals of the batch
"""
if self._goals is None:
self._goals = np.array([transition.goal for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._goals, -1)
return self._goals
def info(self, key, expand_dims=False) -> np.ndarray:
"""
if the given info dictionary key was not converted to a batch before, extract it to a batch and then return the
batch. if the key is not part of the keys in the info dictionary, this will raise an exception
:param expand_dims: add an extra dimension to the info batch
:return: a numpy array containing all the info values of the batch corresponding to the given key
"""
if key not in self._info.keys():
self._info[key] = np.array([transition.info[key] for transition in self.transitions])
if expand_dims:
return np.expand_dims(self._info[key], -1)
return self._info[key]
@property
def size(self) -> int:
"""
:return: the size of the batch
"""
return len(self.transitions)
def __getitem__(self, key):
"""
get an item from the transitions list
:param key: index of the transition in the batch
:return: the transition corresponding to the given index
"""
return self.transitions[key]
def __setitem__(self, key, item):
"""
set an item in the transition list
:param key: index of the transition in the batch
:param item: the transition to place in the given index
:return: None
"""
self.transitions[key] = item
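# A hypothetical usage sketch (not part of the original file): wrapping transitions and pulling
# out lazily-built numpy batches of their fields.
#   batch = Batch([Transition(state={'observation': np.zeros(4)}, action=0, reward=1.0,
#                             game_over=False) for _ in range(32)])
#   observations = batch.states(['observation'])['observation']   # shape (32, 4)
#   rewards = batch.rewards(expand_dims=True)                      # shape (32, 1)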
class TotalStepsCounter(object):
"""
A wrapper around a dictionary counting the number of steps done for each StepMethod type.
"""
def __init__(self):
self.counters = {
EnvironmentEpisodes: 0,
EnvironmentSteps: 0,
TrainingSteps: 0
}
def __getitem__(self, key: Type[StepMethod]) -> int:
"""
get counter value
:param key: counter type
:return: the counter value
"""
return self.counters[key]
def __setitem__(self, key: StepMethod, item: int) -> None:
"""
set an item in the transition list
:param key: counter type
:param item: an integer representing the new counter value
:return: None
"""
self.counters[key] = item
class GradientClippingMethod(Enum):
ClipByGlobalNorm = 0
ClipByNorm = 1
ClipByValue = 2
class Episode(object):
def __init__(self, discount: float=0.99, bootstrap_total_return_from_old_policy: bool=False, n_step: int=-1):
"""
:param discount: the discount factor to use when calculating total returns
:param bootstrap_total_return_from_old_policy: should the total return be bootstrapped from the values in the
memory
:param n_step: the number of future steps to sum the reward over before bootstrapping
"""
self.transitions = []
# a num_transitions x num_transitions table with the n step return in the n'th row
self.returns_table = None
self._length = 0
self.discount = discount
self.bootstrap_total_return_from_old_policy = bootstrap_total_return_from_old_policy
self.n_step = n_step
self.is_complete = False
def insert(self, transition):
self.transitions.append(transition)
self._length += 1
def is_empty(self):
return self.length() == 0
def length(self):
return self._length
def get_transition(self, transition_idx):
return self.transitions[transition_idx]
def get_last_transition(self):
return self.get_transition(-1) if self.length() > 0 else None
def get_first_transition(self):
return self.get_transition(0) if self.length() > 0 else None
def update_returns(self):
if self.n_step == -1 or self.n_step > self.length():
self.n_step = self.length()
rewards = np.array([t.reward for t in self.transitions])
rewards = rewards.astype('float')
total_return = rewards.copy()
current_discount = self.discount
for i in range(1, self.n_step):
total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
current_discount *= self.discount
# calculate the bootstrapped returns
if self.bootstrap_total_return_from_old_policy:
bootstraps = np.array([np.squeeze(t.info['max_action_value']) for t in self.transitions[self.n_step:]])
bootstrapped_return = total_return + current_discount * np.pad(bootstraps, (0, self.n_step), 'constant',
constant_values=0)
total_return = bootstrapped_return
for transition_idx in range(self.length()):
self.transitions[transition_idx].total_return = total_return[transition_idx]
def update_actions_probabilities(self):
probability_product = 1
for transition_idx, transition in enumerate(self.transitions):
if 'action_probabilities' in transition.info.keys():
probability_product *= transition.info['action_probabilities']
for transition_idx, transition in enumerate(self.transitions):
transition.info['probability_product'] = probability_product
def get_returns_table(self):
return self.returns_table
def get_returns(self):
return self.get_transitions_attribute('total_return')
def get_transitions_attribute(self, attribute_name):
if len(self.transitions) > 0 and hasattr(self.transitions[0], attribute_name):
return [getattr(t, attribute_name) for t in self.transitions]
elif len(self.transitions) == 0:
return []
else:
raise ValueError("The transitions have no such attribute name")
def to_batch(self):
batch = []
for i in range(self.length()):
batch.append(self.get_transition(i))
return batch
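# A minimal NumPy sketch (not part of the original file) of the n-step discounted return computed by
# Episode.update_returns: R_t = r_t + gamma * r_{t+1} + ... + gamma^(n-1) * r_{t+n-1}, padding with
# zeros past the end of the episode.
def _n_step_returns_sketch(rewards, discount=0.99, n_step=3):
    rewards = np.asarray(rewards, dtype=float)
    total_return = rewards.copy()
    current_discount = discount
    for i in range(1, n_step):
        # shift the rewards i steps forward and pad the tail with zeros
        total_return += current_discount * np.pad(rewards[i:], (0, i), 'constant', constant_values=0)
        current_discount *= discount
    return total_return

# _n_step_returns_sketch([1, 1, 1, 1], discount=0.5, n_step=2) -> array([1.5, 1.5, 1.5, 1.0])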

73
rl_coach/dashboard.py Normal file
View File

@@ -0,0 +1,73 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
To run Coach Dashboard, run the following command:
python3 dashboard.py
"""
import sys
sys.path.append('.')
import os
from rl_coach.dashboard_components.experiment_board import display_directory_group, display_files
from rl_coach.dashboard_components.globals import doc
import rl_coach.dashboard_components.boards
from rl_coach.dashboard_components.landing_page import landing_page
doc.add_root(landing_page)
import argparse
import glob
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--experiment_dir',
help="(string) The path of an experiment dir to open",
default=None,
type=str)
parser.add_argument('-f', '--experiment_files',
help="(string) The path of an experiment file to open",
default=None,
type=str)
args = parser.parse_args()
if args.experiment_dir:
doc.add_timeout_callback(lambda: display_directory_group(args.experiment_dir), 1000)
elif args.experiment_files:
# expand the glob pattern into the list of matching experiment files
files = glob.glob(args.experiment_files)
doc.add_timeout_callback(lambda: display_files(files), 1000)
def main():
from rl_coach.utils import get_open_port
dashboard_path = os.path.realpath(__file__)
command = 'bokeh serve --show {} --port {}'.format(dashboard_path, get_open_port())
if args.experiment_dir or args.experiment_files:
command += ' --args'
if args.experiment_dir:
command += ' --experiment_dir {}'.format(args.experiment_dir)
if args.experiment_files:
command += ' --experiment_files {}'.format(args.experiment_files)
os.system(command)
if __name__ == "__main__":
main()
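# Illustrative invocation (the experiment path below is a placeholder):
#   python3 rl_coach/dashboard.py --experiment_dir ./experiments/my_run
# main() then launches `bokeh serve --show <this file> --port <open port> --args --experiment_dir ./experiments/my_run`.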

View File

@@ -0,0 +1,21 @@
from bokeh.layouts import column
from bokeh.models.widgets import Panel, Tabs
from rl_coach.dashboard_components.experiment_board import experiment_board_layout
from rl_coach.dashboard_components.episodic_board import episodic_board_layout
from rl_coach.dashboard_components.globals import spinner, layouts
from bokeh.models.widgets import Div
# ---------------- Build Website Layout -------------------
# title
title = Div(text="""<h1>Coach Dashboard</h1>""")
center = Div(text="""<style>html { padding-left: 50px; } </style>""")
tab1 = Panel(child=experiment_board_layout, title='experiment board')
# tab2 = Panel(child=episodic_board_layout, title='episodic board')
# tabs = Tabs(tabs=[tab1, tab2])
tabs = Tabs(tabs=[tab1])
layout = column(title, center, tabs)
layout = column(layout, spinner)
layouts['boards'] = layout

View File

@@ -0,0 +1,99 @@
from bokeh.layouts import row, column, widgetbox, Spacer
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup, Toggle
from bokeh.plotting import figure
from rl_coach.dashboard_components.globals import layouts, crcolor, crx, cry, color_resolution, crRGBs
from rl_coach.dashboard_components.experiment_board import file_selection_button, files_selector_spacer, \
group_selection_button, unload_file_button, files_selector
# ---------------- Build Website Layout -------------------
# file refresh time placeholder
refresh_info = Div(text="""""", width=210)
# create figures
plot = figure(plot_width=1200, plot_height=800,
tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
toolbar_location='above', x_axis_label='Episodes',
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000))
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
plot.yaxis[-1].visible = False
# legend
div = Div(text="""""")
legend = widgetbox([div])
bokeh_legend = Legend(
# items=[("12345678901234567890123456789012345678901234567890", [])], # 50 letters
items=[("__________________________________________________", [])], # 50 letters
location=(0, 0), orientation="vertical",
border_line_color="black",
label_text_font_size={'value': '9pt'},
margin=30
)
plot.add_layout(bokeh_legend, "right")
# select file
file_selection_button = Button(label="Select Files", button_type="success", width=120)
# file_selection_button.on_click(load_files_group)
files_selector_spacer = Spacer(width=10)
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
# group_selection_button.on_click(load_directory_group)
unload_file_button = Button(label="Unload", button_type="danger", width=50)
# unload_file_button.on_click(unload_file)
# files selection box
files_selector = Select(title="Files:", options=[])
# files_selector.on_change('value', change_data_selector)
# data selection box
data_selector = MultiSelect(title="Data:", options=[], size=12)
# data_selector.on_change('value', select_data)
# toggle second axis button
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
# toggle_second_axis_button.on_click(toggle_second_axis)
# averaging slider
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10)
# averaging_slider.on_change('value', update_averaging)
# color selector
color_selector_title = Div(text="""Select Color:""")
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
plot_width=300, plot_height=40,
tools='tap')
color_selector.axis.visible = False
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
color='crcolor', source=crsource)
# crsource.on_change('selected', select_color)
color_range.nonselection_glyph = color_range.glyph
color_selector.toolbar.logo = None
color_selector.toolbar_location = None
episode_selector = MultiSelect(title="Episode:", options=['0', '1', '2', '3', '4'], size=1)
online_toggle = Toggle(label="Online", button_type="success")
# main layout of the document
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
layout = column(layout, files_selector)
layout = column(layout, row(refresh_info, unload_file_button))
layout = column(layout, data_selector)
layout = column(layout, color_selector_title)
layout = column(layout, color_selector)
layout = column(layout, toggle_second_axis_button)
layout = column(layout, averaging_slider)
layout = column(layout, episode_selector)
layout = column(layout, online_toggle)
layout = row(layout, plot)
episodic_board_layout = layout
layouts["episodic_board"] = episodic_board_layout

View File

@@ -0,0 +1,564 @@
import copy
import datetime
import os
import sys
import time
from itertools import cycle
from os import listdir
from os.path import isfile, join, isdir
from bokeh.layouts import row, column, Spacer, ToolbarBox
from bokeh.models import ColumnDataSource, Range1d, LinearAxis, Legend, \
WheelZoomTool, CrosshairTool, ResetTool, SaveTool, Toolbar, PanTool, BoxZoomTool, \
Toggle
from bokeh.models.callbacks import CustomJS
from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup
from bokeh.plotting import figure
from rl_coach.dashboard_components.globals import signals_files, x_axis_labels, x_axis_options, show_spinner, hide_spinner, \
dialog, FolderType, RunType, add_directory_csv_files, doc, display_boards, layouts, \
crcolor, crx, cry, color_resolution, crRGBs, rgb_to_hex, x_axis
from rl_coach.dashboard_components.signals_files_group import SignalsFilesGroup
from rl_coach.dashboard_components.signals_file import SignalsFile
def update_axis_range(name, range_placeholder):
max_val = -float('inf')
min_val = float('inf')
selected_signal = None
if name in x_axis_options:
selected_signal = name
for signals_file in signals_files.values():
curr_min_val, curr_max_val = signals_file.get_range_of_selected_signals_on_axis(name, selected_signal)
max_val = max(max_val, curr_max_val)
min_val = min(min_val, curr_min_val)
if min_val != float('inf'):
if min_val == max_val:
range = 5
else:
range = max_val - min_val
range_placeholder.start = min_val - 0.1 * range
range_placeholder.end = max_val + 0.1 * range
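# e.g. selected signals spanning [0, 50] give range = 50, so the axis is set to [-5, 55]
# (10% padding on each side); a flat signal (min == max) falls back to range = 5, i.e. ±0.5 of padding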
# update axes ranges
def update_y_axis_ranges():
update_axis_range('default', plot.y_range)
update_axis_range('secondary', plot.extra_y_ranges['secondary'])
def update_x_axis_ranges():
update_axis_range(x_axis[0], plot.x_range)
def get_all_selected_signals():
signals = []
for signals_file in signals_files.values():
signals += signals_file.get_selected_signals()
return signals
# update legend using the legend text dictionary
def update_legend():
selected_signals = get_all_selected_signals()
max_line_length = 50
items = []
for signal in selected_signals:
side_sign = "" if signal.axis == 'default' else ""
signal_name = side_sign + " " + signal.full_name
# bokeh legend does not respect a max_width parameter so we split the text manually to lines of constant width
signal_name = [signal_name[n:n + max_line_length] for n in range(0, len(signal_name), max_line_length)]
for idx, substr in enumerate(signal_name):
if idx == 0:
lines = [signal.line]
if signal.show_bollinger_bands:
lines.append(signal.bands)
items.append((substr, lines))
else:
items.append((substr, []))
if bokeh_legend.items == [] or items == [] or \
any([legend_item.renderers != item[1] for legend_item, item in zip(bokeh_legend.items, items)])\
or any([legend_item.label != item[0] for legend_item, item in zip(bokeh_legend.items, items)]):
bokeh_legend.items = items # this step takes a long time because it is redrawing the plot
# the visible=false => visible=true is a hack to make the legend render again
bokeh_legend.visible = False
bokeh_legend.visible = True
# select lines to display
def select_data(args, old, new):
if selected_file is None:
return
show_spinner("Updating the signal selection...")
selected_signals = new
for signal_name in selected_file.signals.keys():
is_selected = signal_name in selected_signals
selected_file.set_signal_selection(signal_name, is_selected)
# update axes ranges
update_y_axis_ranges()
update_x_axis_ranges()
# update the legend
update_legend()
hide_spinner()
# add new lines to the plot
def plot_signals(signals_file, signals):
for idx, signal in enumerate(signals):
signal.line = plot.line('index', signal.name, source=signals_file.bokeh_source,
line_color=signal.color, line_width=2)
def open_file_dialog():
return dialog.getFileDialog()
def open_directory_dialog():
return dialog.getDirDialog()
# will create a group from the files
def create_files_group_signal(files):
global selected_file
signals_file = SignalsFilesGroup(files, plot)
signals_files[signals_file.filename] = signals_file
filenames = [signals_file.filename]
if files_selector.options[0] == "":
files_selector.options = filenames
else:
files_selector.options = files_selector.options + filenames
files_selector.value = filenames[0]
selected_file = signals_file
# load files from disk as a group
def load_files_group():
show_spinner("Loading files group...")
files = open_file_dialog()
# no files selected
if not files or not files[0]:
hide_spinner()
return
display_boards()
if len(files) == 1:
create_files_signal(files)
else:
create_files_group_signal(files)
change_selected_signals_in_data_selector([""])
hide_spinner()
# classify the folder as containing a single file, multiple files or only folders
def classify_folder(dir_path):
files = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')]
folders = [d for d in listdir(dir_path) if isdir(join(dir_path, d)) and any(f.endswith(".csv") for f in os.listdir(join(dir_path, d)))]
if len(files) == 1:
return FolderType.SINGLE_FILE
elif len(files) > 1:
return FolderType.MULTIPLE_FILES
elif len(folders) == 1:
return classify_folder(join(dir_path, folders[0]))
elif len(folders) > 1:
return FolderType.MULTIPLE_FOLDERS
else:
return FolderType.EMPTY
# finds if this is single-threaded or multi-threaded
def get_run_type(dir_path):
folder_type = classify_folder(dir_path)
if folder_type == FolderType.SINGLE_FILE:
folder_type = RunType.SINGLE_FOLDER_SINGLE_FILE
elif folder_type == FolderType.MULTIPLE_FILES:
folder_type = RunType.SINGLE_FOLDER_MULTIPLE_FILES
elif folder_type == FolderType.MULTIPLE_FOLDERS:
# folder contains sub dirs -> we assume we can classify the folder using only the first sub dir
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
# checking only the first folder in the root dir for its type, since we assume that all sub dirs share the
# same structure (i.e. if one is the result of a multi-threaded run, so are all the others).
folder_type = classify_folder(os.path.join(dir_path, sub_dirs[0]))
if folder_type == FolderType.SINGLE_FILE:
folder_type = RunType.MULTIPLE_FOLDERS_SINGLE_FILES
elif folder_type == FolderType.MULTIPLE_FILES:
folder_type = RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES
return folder_type
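# Illustrative layouts (directory and file names are hypothetical):
#   exp/worker_0.csv                                        -> RunType.SINGLE_FOLDER_SINGLE_FILE
#   exp/worker_0.csv + exp/worker_1.csv                     -> RunType.SINGLE_FOLDER_MULTIPLE_FILES
#   exp/run_1/worker_0.csv + exp/run_2/worker_0.csv         -> RunType.MULTIPLE_FOLDERS_SINGLE_FILES
#   exp/run_1/worker_{0,1}.csv + exp/run_2/worker_{0,1}.csv -> RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES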
# create a signal file from the directory path according to the directory underlying structure
def handle_dir(dir_path, run_type):
paths = add_directory_csv_files(dir_path)
if run_type in [RunType.SINGLE_FOLDER_MULTIPLE_FILES,
RunType.MULTIPLE_FOLDERS_SINGLE_FILES]:
create_files_group_signal(paths)
elif run_type == RunType.SINGLE_FOLDER_SINGLE_FILE:
create_files_signal(paths, use_dir_name=True)
elif run_type == RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES:
sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
create_files_group_signal([os.path.join(dir_path, d) for d in sub_dirs])
# load directory from disk as a group
def load_directory_group():
show_spinner("Loading directories group...")
directory = open_directory_dialog()
# no files selected
if not directory:
hide_spinner()
return
display_directory_group(directory)
def display_directory_group(directory):
pause_auto_update()
display_boards()
show_spinner("Loading directories group...")
while get_run_type(directory) == FolderType.EMPTY:
show_spinner("Waiting for experiment directory to get populated...")
sys.stdout.write("Waiting for experiment directory to get populated...\r")
time.sleep(10)
handle_dir(directory, get_run_type(directory))
change_selected_signals_in_data_selector([""])
resume_auto_update_according_to_toggle()
hide_spinner()
def create_files_signal(files, use_dir_name=False):
global selected_file
new_signal_files = []
for idx, file_path in enumerate(files):
signals_file = SignalsFile(str(file_path), plot=plot, use_dir_name=use_dir_name)
signals_files[signals_file.filename] = signals_file
new_signal_files.append(signals_file)
filenames = [f.filename for f in new_signal_files]
if files_selector.options[0] == "":
files_selector.options = filenames
else:
files_selector.options = files_selector.options + filenames
files_selector.value = filenames[0]
selected_file = new_signal_files[0]
# load files from disk
def load_files():
show_spinner("Loading files...")
files = open_file_dialog()
# no files selected
if not files or not files[0]:
hide_spinner()
return
display_files(files)
def display_files(files):
pause_auto_update()
display_boards()
show_spinner("Loading files...")
create_files_signal(files)
change_selected_signals_in_data_selector([""])
resume_auto_update_according_to_toggle()
hide_spinner()
def unload_file():
global selected_file
if selected_file is None:
return
selected_file.hide_all_signals()
del signals_files[selected_file.filename]
data_selector.options = [""]
filenames_list = copy.copy(files_selector.options)
filenames_list.remove(selected_file.filename)
if len(filenames_list) == 0:
filenames_list = [""]
files_selector.options = filenames_list
filenames = cycle(filenames_list)
if files_selector.options[0] != "":
files_selector.value = next(filenames)
else:
files_selector.value = None
update_legend()
refresh_info.text = ""
if len(signals_files) == 0:
selected_file = None
# reload the selected csv file
def reload_all_files(force=False):
pause_auto_update()
for file_to_load in signals_files.values():
if force or file_to_load.file_was_modified_on_disk():
show_spinner("Updating files from the disk...")
file_to_load.load()
hide_spinner()
refresh_info.text = "Last Update: " + str(datetime.datetime.now()).split(".")[0]
resume_auto_update_according_to_toggle()
# unselect the currently selected signals and then select the requested signals in the data selector
def change_selected_signals_in_data_selector(selected_signals):
# the default bokeh way is not working due to a bug since Bokeh 0.12.6 (https://github.com/bokeh/bokeh/issues/6501)
# remove the data selection callback before updating the selector
data_selector.remove_on_change('value', select_data)
for value in list(data_selector.value):
if value in data_selector.options:
index = data_selector.options.index(value)
data_selector.options.remove(value)
data_selector.value.remove(value)
data_selector.options.insert(index, value)
data_selector.value = selected_signals
# add back the data selection callback
data_selector.on_change('value', select_data)
# change data options according to the selected file
def change_data_selector(args, old, new):
global selected_file
if new is None:
selected_file = None
return
show_spinner("Updating selection...")
selected_file = signals_files[new]
if isinstance(selected_file, SignalsFile):
group_cb.disabled = True
elif isinstance(selected_file, SignalsFilesGroup):
group_cb.disabled = False
data_selector.remove_on_change('value', select_data)
data_selector.options = sorted(list(selected_file.signals.keys()))
data_selector.on_change('value', select_data)
selected_signal_names = [s.name for s in selected_file.signals.values() if s.selected]
if not selected_signal_names:
selected_signal_names = [""]
change_selected_signals_in_data_selector(selected_signal_names)
averaging_slider.value = selected_file.signals_averaging_window
if len(averaging_slider_dummy_source.data['value']) > 0:
averaging_slider_dummy_source.data['value'][0] = selected_file.signals_averaging_window
group_cb.active = [0 if selected_file.show_bollinger_bands else None]
group_cb.active += [1 if selected_file.separate_files else None]
hide_spinner()
# smooth all the signals of the selected file
def update_averaging(args, old, new):
show_spinner("Smoothing the signals...")
# get the actual value from the dummy source
new = averaging_slider_dummy_source.data['value'][0]
selected_file.change_averaging_window(new)
hide_spinner()
def change_x_axis(val):
global x_axis
show_spinner("Updating the X axis...")
x_axis[0] = x_axis_options[val]
plot.xaxis.axis_label = x_axis_labels[val]
for file_to_load in signals_files.values():
file_to_load.update_x_axis_index()
# this is needed in order to recalculate the mean of all the files
if isinstance(file_to_load, SignalsFilesGroup):
file_to_load.load()
update_axis_range(x_axis[0], plot.x_range)
hide_spinner()
# move the signal between the main and secondary Y axes
def toggle_second_axis():
show_spinner("Switching the Y axis...")
plot.yaxis[-1].visible = True
selected_file.toggle_y_axis()
# this is just for redrawing the signals
selected_file.reload_data()
update_y_axis_ranges()
update_legend()
hide_spinner()
def toggle_group_property(new):
show_spinner("Loading...")
# toggle show / hide Bollinger bands
selected_file.change_bollinger_bands_state(0 in new)
# show a separate signal for each file in a group
selected_file.show_files_separately(1 in new)
update_legend()
hide_spinner()
# Color selection - most of these functions are taken from bokeh examples (plotting/color_sliders.py)
def select_color(attr, old, new):
show_spinner("Changing signal color...")
signals = selected_file.get_selected_signals()
for signal in signals:
signal.set_color(rgb_to_hex(crRGBs[new['1d']['indices'][0]]))
hide_spinner()
def pause_auto_update():
toggle_auto_update(False)
def resume_auto_update_according_to_toggle():
toggle_auto_update(auto_update_toggle_button.active)
def toggle_auto_update(new):
global file_update_callback
if new is False and file_update_callback in doc._session_callbacks:
doc.remove_periodic_callback(file_update_callback)
elif file_update_callback not in doc._session_callbacks:
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
file_update_callback = doc.add_periodic_callback(reload_all_files, 30000)
# ---------------- Build Website Layout -------------------
# file refresh time placeholder
refresh_info = Div(text="""""", width=210)
# create figures
plot = figure(plot_width=1200, plot_height=800,
# tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
toolbar_location=None, x_axis_label='Episodes',
x_range=Range1d(0, 10000), y_range=Range1d(0, 100000), lod_factor=1000)
plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
toolbar = Toolbar(tools=[PanTool(), BoxZoomTool(), WheelZoomTool(), CrosshairTool(), ResetTool(), SaveTool()])
# plot.toolbar = toolbar
plot.add_tools(*toolbar.tools)
plot.yaxis[-1].visible = False
bokeh_legend = Legend(
items=[("", [])],
orientation="vertical",
border_line_color="black",
label_text_font_size={'value': '9pt'},
click_policy='hide',
visible=False
)
bokeh_legend.label_width = 100
plot.add_layout(bokeh_legend, "right")
plot.y_range = Range1d(0, 100)
plot.extra_y_ranges['secondary'] = Range1d(0, 100)
# select file
file_selection_button = Button(label="Select Files", button_type="success", width=120)
file_selection_button.on_click(load_files_group)
files_selector_spacer = Spacer(width=10)
group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
group_selection_button.on_click(load_directory_group)
update_files_button = Button(label="Update Files", button_type="default", width=50)
update_files_button.on_click(reload_all_files)
auto_update_toggle_button = Toggle(label="Auto Update", button_type="default", width=50, active=True)
auto_update_toggle_button.on_click(toggle_auto_update)
unload_file_button = Button(label="Unload", button_type="danger", width=50)
unload_file_button.on_click(unload_file)
# files selection box
files_selector = Select(title="Files:", options=[""])
files_selector.on_change('value', change_data_selector)
# data selection box
data_selector = MultiSelect(title="Data:", options=[], size=12)
data_selector.on_change('value', select_data)
# x axis selection box
x_axis_selector_title = Div(text="""X Axis:""", height=10)
x_axis_selector = RadioButtonGroup(labels=x_axis_options, active=0)
x_axis_selector.on_click(change_x_axis)
# toggle second axis button
toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
toggle_second_axis_button.on_click(toggle_second_axis)
# averaging slider
# This data source is just used to communicate / trigger the real callback
averaging_slider_dummy_source = ColumnDataSource(data=dict(value=[]))
averaging_slider_dummy_source.on_change('data', update_averaging)
averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10, callback_policy='mouseup')
averaging_slider.callback = CustomJS(args=dict(source=averaging_slider_dummy_source), code="""
source.data = { value: [cb_obj.value] }
""")
# group properties checkbox
group_cb = CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[])
group_cb.on_click(toggle_group_property)
# color selector
color_selector_title = Div(text="""Select Color:""")
crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
plot_width=300, plot_height=40,
tools='tap')
color_selector.axis.visible = False
color_range = color_selector.rect(x='x', y='y', width=1, height=10,
color='crcolor', source=crsource)
crsource.on_change('selected', select_color)
color_range.nonselection_glyph = color_range.glyph
color_selector.toolbar.logo = None
color_selector.toolbar_location = None
# main layout of the document
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
layout = column(layout, files_selector)
layout = column(layout, row(update_files_button, Spacer(width=50), auto_update_toggle_button,
Spacer(width=50), unload_file_button))
layout = column(layout, row(refresh_info))
layout = column(layout, data_selector)
layout = column(layout, color_selector_title)
layout = column(layout, color_selector)
layout = column(layout, x_axis_selector_title)
layout = column(layout, x_axis_selector)
layout = column(layout, group_cb)
layout = column(layout, toggle_second_axis_button)
layout = column(layout, averaging_slider)
toolbox = ToolbarBox(toolbar=toolbar, toolbar_location='above')
panel = column(toolbox, plot)
layout = row(layout, panel)
experiment_board_layout = layout
layouts["experiment_board"] = experiment_board_layout

View File

@@ -0,0 +1,136 @@
import os
from genericpath import isdir, isfile
from os import listdir
from os.path import join
from enum import Enum
from bokeh.models import Div
from bokeh.plotting import curdoc
import wx
import colorsys
patches = {}
signals_files = {}
selected_file = None
x_axis = ['Episode #']
x_axis_options = ['Episode #', 'Total steps', 'Wall-Clock Time']
x_axis_labels = ['Episode #', 'Total steps (per worker)', 'Wall-Clock Time (minutes)']
current_color = 0
# spinner
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
with open(os.path.join(root_dir, 'dashboard_components/spinner.css'), 'r') as f:
spinner_style = """<style>{}</style>""".format(f.read())
spinner_html = """<ul class="spinner"><li></li><li></li><li></li><li></li>
<li>
<br>
<span style="font-size: 24px; font-weight: bold; margin-left: -175px; width: 400px;
position: absolute; text-align: center;">
{}
</span>
</li></ul>"""
spinner = Div(text="""""")
displayed_doc = "landing_page"
layouts = {}
def generate_color_range(N, I):
HSV_tuples = [(x*1.0/N, 0.5, I) for x in range(N)]
RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)
for_conversion = []
for RGB_tuple in RGB_tuples:
for_conversion.append((int(RGB_tuple[0]*255), int(RGB_tuple[1]*255), int(RGB_tuple[2]*255)))
hex_colors = [rgb_to_hex(RGB_tuple) for RGB_tuple in for_conversion]
return hex_colors, for_conversion
# convert RGB tuple to hexadecimal code
def rgb_to_hex(rgb):
return '#%02x%02x%02x' % rgb
# convert hexadecimal to RGB tuple
def hex_to_dec(hex):
red = ''.join(hex.strip('#')[0:2])
green = ''.join(hex.strip('#')[2:4])
blue = ''.join(hex.strip('#')[4:6])
return int(red, 16), int(green, 16), int(blue,16)
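# e.g. rgb_to_hex((255, 0, 60)) == '#ff003c' and hex_to_dec('#ff003c') == (255, 0, 60)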
color_resolution = 1000
brightness = 0.75 # change to have brighter/darker colors
crx = list(range(1, color_resolution+1)) # the resolution is 1000 colors
cry = [5 for i in range(len(crx))]
crcolor, crRGBs = generate_color_range(color_resolution, brightness) # produce spectrum
def display_boards():
global displayed_doc
if displayed_doc == "landing_page":
doc.remove_root(doc.roots[0])
doc.add_root(layouts["boards"])
displayed_doc = "boards"
def show_spinner(text="Loading..."):
spinner.text = spinner_style + spinner_html.format(text)
def hide_spinner():
spinner.text = ""
# takes a path to a dir and recursively adds all of its .csv files to the paths list
def add_directory_csv_files(dir_path, paths=None):
if not paths:
paths = []
for p in listdir(dir_path):
path = join(dir_path, p)
if isdir(path):
# call recursively for each dir
paths = add_directory_csv_files(path, paths)
elif isfile(path) and path.endswith('.csv'):
# add every file to the list
paths.append(path)
return paths
class DialogApp(wx.App):
def getFileDialog(self):
with wx.FileDialog(None, "Open CSV file", wildcard="CSV files (*.csv)|*.csv",
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR | wx.FD_MULTIPLE) as fileDialog:
if fileDialog.ShowModal() == wx.ID_CANCEL:
return None # the user changed their mind
else:
# Proceed loading the file chosen by the user
return fileDialog.GetPaths()
def getDirDialog(self):
with wx.DirDialog(None, "Choose input directory", "",
style=wx.FD_OPEN | wx.FD_FILE_MUST_EXIST | wx.FD_CHANGE_DIR) as dirDialog:
if dirDialog.ShowModal() == wx.ID_CANCEL:
return None # the user changed their mind
else:
# Proceed loading the dir chosen by the user
return dirDialog.GetPath()
class RunType(Enum):
SINGLE_FOLDER_SINGLE_FILE = 1
SINGLE_FOLDER_MULTIPLE_FILES = 2
MULTIPLE_FOLDERS_SINGLE_FILES = 3
MULTIPLE_FOLDERS_MULTIPLE_FILES = 4
UNKNOWN = 0
class FolderType(Enum):
SINGLE_FILE = 1
MULTIPLE_FILES = 2
MULTIPLE_FOLDERS = 3
EMPTY = 4
dialog = DialogApp()
doc = curdoc()

View File

@@ -0,0 +1,22 @@
from bokeh.layouts import row, column
from bokeh.models.widgets import Div
from rl_coach.dashboard_components.experiment_board import file_selection_button, group_selection_button
from rl_coach.dashboard_components.globals import layouts
# title
title = Div(text="""<h1>Coach Dashboard</h1>""")
# landing page
landing_page_description = Div(text="""<h3>Start by selecting an experiment file or directory to open:</h3>""")
center = Div(text="""<style>html { text-align: center; } </style>""")
center_buttons = Div(text="""<style>.bk-root .bk-widget { margin: 0 auto; }</style>""", width=0)
landing_page = column(center,
title,
landing_page_description,
row(center_buttons),
row(file_selection_button, sizing_mode='scale_width'),
row(group_selection_button, sizing_mode='scale_width'),
sizing_mode='scale_width')
layouts['landing_page'] = landing_page

View File

@@ -0,0 +1,125 @@
import random
import numpy as np
from bokeh.models import ColumnDataSource
from bokeh.palettes import Dark2
from rl_coach.dashboard_components.globals import show_spinner, hide_spinner, current_color
from rl_coach.utils import squeeze_list
class Signal:
def __init__(self, name, parent, plot):
self.name = name
self.full_name = "{}/{}".format(parent.filename, self.name)
self.plot = plot
self.selected = False
self.color = random.choice(Dark2[8])
self.line = None
self.scatter = None
self.bands = None
self.bokeh_source = parent.bokeh_source
self.min_val = 0
self.max_val = 0
self.axis = 'default'
self.sub_signals = []
for name in self.bokeh_source.data.keys():
if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name:
self.sub_signals.append(name)
if len(self.sub_signals) > 1:
self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]])
self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]])
self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]])
self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]])
else:
self.mean_signal = squeeze_list(self.name)
self.stdev_signal = None
self.min_signal = None
self.max_signal = None
self.has_bollinger_bands = False
if self.mean_signal and self.stdev_signal and self.min_signal and self.max_signal:
self.has_bollinger_bands = True
self.show_bollinger_bands = False
self.bollinger_bands_source = None
self.update_range()
def set_color(self, color):
self.color = color
if self.line:
self.line.glyph.line_color = color
if self.bands:
self.bands.glyph.fill_color = color
def plot_line(self):
global current_color
self.set_color(Dark2[8][current_color])
current_color = (current_color + 1) % len(Dark2[8])
if self.has_bollinger_bands:
self.set_bands_source()
self.create_bands()
self.line = self.plot.line('index', self.mean_signal, source=self.bokeh_source,
line_color=self.color, line_width=2)
# self.scatter = self.plot.scatter('index', self.mean_signal, source=self.bokeh_source)
self.line.visible = True
def set_selected(self, val):
if self.selected != val:
self.selected = val
if self.line:
# self.set_color(Dark2[8][current_color])
# current_color = (current_color + 1) % len(Dark2[8])
self.line.visible = self.selected
if self.bands:
self.bands.visible = self.selected and self.show_bollinger_bands
elif self.selected:
# lazy plotting - plot only when selected for the first time
self.plot_line()
def set_dash(self, dash):
self.line.glyph.line_dash = dash
def create_bands(self):
self.bands = self.plot.patch(x='band_x', y='band_y', source=self.bollinger_bands_source,
color=self.color, fill_alpha=0.4, alpha=0.1, line_width=0)
self.bands.visible = self.show_bollinger_bands
# self.min_line = plot.line('index', self.min_signal, source=self.bokeh_source,
# line_color=self.color, line_width=3, line_dash="4 4")
# self.max_line = plot.line('index', self.max_signal, source=self.bokeh_source,
# line_color=self.color, line_width=3, line_dash="4 4")
# self.min_line.visible = self.show_bollinger_bands
# self.max_line.visible = self.show_bollinger_bands
def set_bands_source(self):
x_ticks = self.bokeh_source.data['index']
mean_values = self.bokeh_source.data[self.mean_signal]
stdev_values = self.bokeh_source.data[self.stdev_signal]
band_x = np.append(x_ticks, x_ticks[::-1])
band_y = np.append(mean_values - stdev_values, mean_values[::-1] + stdev_values[::-1])
source_data = {'band_x': band_x, 'band_y': band_y}
if self.bollinger_bands_source:
self.bollinger_bands_source.data = source_data
else:
self.bollinger_bands_source = ColumnDataSource(source_data)
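# e.g. for x = [0, 1, 2], mean = [m0, m1, m2], stdev = [s0, s1, s2] the patch polygon is
# band_x = [0, 1, 2, 2, 1, 0] and band_y = [m0-s0, m1-s1, m2-s2, m2+s2, m1+s1, m0+s0],
# i.e. the lower band traversed left-to-right followed by the upper band right-to-left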
def change_bollinger_bands_state(self, new_state):
self.show_bollinger_bands = new_state
if self.bands and self.selected:
self.bands.visible = new_state
# self.min_line.visible = new_state
# self.max_line.visible = new_state
def update_range(self):
self.min_val = np.min(self.bokeh_source.data[self.mean_signal])
self.max_val = np.max(self.bokeh_source.data[self.mean_signal])
def set_axis(self, axis):
self.axis = axis
if not self.line:
self.plot_line()
self.line.visible = False
self.line.y_range_name = axis
def toggle_axis(self):
if self.axis == 'default':
self.set_axis('secondary')
else:
self.set_axis('default')

View File

@@ -0,0 +1,63 @@
import os
from os.path import basename
import pandas as pd
from pandas.errors import EmptyDataError
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
from rl_coach.dashboard_components.globals import x_axis_options
from rl_coach.utils import break_file_path
class SignalsFile(SignalsFileBase):
def __init__(self, csv_path, load=True, plot=None, use_dir_name=False):
super().__init__(plot)
self.use_dir_name = use_dir_name
self.full_csv_path = csv_path
self.dir, self.filename, _ = break_file_path(csv_path)
if use_dir_name:
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_path), '..'))
if len(os.listdir(parent_directory_path)) == 1:
# get the parent directory name (since the current directory is the timestamp directory)
self.dir = parent_directory_path
self.filename = basename(self.dir)
else:
# get the common directory for all the experiments
self.dir = os.path.dirname(csv_path)
self.filename = "{}/{}".format(basename(parent_directory_path), basename(self.dir))
if load:
self.load()
# this helps set the correct x axis
self.change_averaging_window(1, force=True)
def load_csv(self, idx=None, result=None):
# load the csv and fix sparse data.
# the csv can be in the middle of being written, so we retry inside a try-except until it parses
new_csv = None
while new_csv is None:
try:
new_csv = pd.read_csv(self.full_csv_path)
break
except EmptyDataError:
new_csv = None
continue
new_csv['Wall-Clock Time'] /= 60.
new_csv = new_csv.interpolate()
# remove signals which don't contain any values
for k, v in new_csv.isna().all().items():
if v and k not in x_axis_options:
del new_csv[k]
new_csv.fillna(value=0, inplace=True)
self.csv = new_csv
self.last_modified = os.path.getmtime(self.full_csv_path)
if idx is not None:
result[idx] = (self.csv, self.last_modified)
def file_was_modified_on_disk(self):
return self.last_modified != os.path.getmtime(self.full_csv_path)

View File

@@ -0,0 +1,129 @@
import numpy as np
from bokeh.models import ColumnDataSource
from rl_coach.dashboard_components.signals import Signal
from rl_coach.dashboard_components.globals import x_axis, x_axis_options, show_spinner
class SignalsFileBase:
def __init__(self, plot):
self.plot = plot
self.full_csv_path = ""
self.dir = ""
self.filename = ""
self.signals_averaging_window = 1
self.show_bollinger_bands = False
self.csv = None
self.bokeh_source = None
self.bokeh_source_orig = None
self.last_modified = None
self.signals = {}
self.separate_files = False
self.last_reload_data_fix = False
def load_csv(self):
pass
def update_x_axis_index(self):
global x_axis
self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis[0]]
self.bokeh_source.data['index'] = self.bokeh_source.data[x_axis[0]]
def toggle_y_axis(self, signal_name=None):
if signal_name and signal_name in self.signals.keys():
self.signals[signal_name].toggle_axis()
else:
for signal in self.signals.values():
if signal.selected:
signal.toggle_axis()
def update_source_and_signals(self):
# create bokeh data sources
self.bokeh_source_orig = ColumnDataSource(self.csv)
if self.bokeh_source is None:
self.bokeh_source = ColumnDataSource(self.csv)
self.update_x_axis_index()
else:
self.update_x_axis_index()
# smooth the data if necessary
self.change_averaging_window(self.signals_averaging_window, force=True)
# create all the signals
if len(self.signals.keys()) == 0:
self.signals = {}
unique_signal_names = []
for name in self.csv.columns:
if len(name.split('/')) == 1:
unique_signal_names.append(name)
else:
unique_signal_names.append('/'.join(name.split('/')[:-1]))
unique_signal_names = list(set(unique_signal_names))
for signal_name in unique_signal_names:
self.signals[signal_name] = Signal(signal_name, self, self.plot)
def load(self):
self.load_csv()
self.update_source_and_signals()
def reload_data(self):
# this function is a workaround to reload the data of all the signals
# if the data doesn't change, bokeh does not refresh the line
temp_data = self.bokeh_source.data.copy()
for col in self.bokeh_source.data.keys():
if not self.last_reload_data_fix:
temp_data[col] = temp_data[col][:-1]
self.last_reload_data_fix = not self.last_reload_data_fix
self.bokeh_source.data = temp_data
def change_averaging_window(self, new_size, force=False, signals=None):
if force or self.signals_averaging_window != new_size:
self.signals_averaging_window = new_size
win = np.ones(new_size) / new_size
temp_data = self.bokeh_source_orig.data.copy()
for col in self.bokeh_source.data.keys():
if col == 'index' or col in x_axis_options \
or (signals and not any(col in signal for signal in signals)):
temp_data[col] = temp_data[col][:-new_size]
continue
temp_data[col] = np.convolve(self.bokeh_source_orig.data[col], win, mode='same')[:-new_size]
self.bokeh_source.data = temp_data
# smooth bollinger bands
for signal in self.signals.values():
if signal.has_bollinger_bands:
signal.set_bands_source()
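# e.g. new_size = 5 gives win = [0.2] * 5, so np.convolve(x, win, mode='same') replaces each point
# with the mean of itself and its two nearest neighbours on each side (a simple box filter)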
def hide_all_signals(self):
for signal_name in self.signals.keys():
self.set_signal_selection(signal_name, False)
def set_signal_selection(self, signal_name, val):
self.signals[signal_name].set_selected(val)
def change_bollinger_bands_state(self, new_state):
self.show_bollinger_bands = new_state
for signal in self.signals.values():
signal.change_bollinger_bands_state(new_state)
def file_was_modified_on_disk(self):
pass
def get_range_of_selected_signals_on_axis(self, axis, selected_signal=None):
max_val = -float('inf')
min_val = float('inf')
for signal in self.signals.values():
if (selected_signal and signal.name == selected_signal) or (signal.selected and signal.axis == axis):
max_val = max(max_val, signal.max_val)
min_val = min(min_val, signal.min_val)
return min_val, max_val
def get_selected_signals(self):
signals = []
for signal in self.signals.values():
if signal.selected:
signals.append(signal)
return signals
def show_files_separately(self, val):
pass

View File

@@ -0,0 +1,192 @@
import os
from multiprocessing import Process, Manager
from os.path import basename
import pandas as pd
from rl_coach.dashboard_components.globals import x_axis_options, add_directory_csv_files, show_spinner, x_axis
from rl_coach.dashboard_components.signals_file_base import SignalsFileBase
from rl_coach.dashboard_components.signals_file import SignalsFile
class SignalsFilesGroup(SignalsFileBase):
def __init__(self, csv_paths, plot=None):
super().__init__(plot)
self.full_csv_paths = csv_paths
self.signals_files = []
if len(csv_paths) == 1 and os.path.isdir(csv_paths[0]):
self.signals_files = [SignalsFile(str(file), load=False, plot=plot) for file in add_directory_csv_files(csv_paths[0])]
else:
for csv_path in csv_paths:
if os.path.isdir(csv_path):
self.signals_files.append(SignalsFilesGroup(add_directory_csv_files(csv_path), plot=plot))
else:
self.signals_files.append(SignalsFile(str(csv_path), load=False, plot=plot))
parent_directory_path = os.path.abspath(os.path.join(os.path.dirname(csv_paths[0]), '..'))
if len(os.listdir(parent_directory_path)) == 1:
# get the parent directory name (since the current directory is the timestamp directory)
self.dir = parent_directory_path
else:
# get the common directory for all the experiments
self.dir = os.path.dirname('/'.join(os.path.commonprefix(csv_paths).split('/')[:-1]) + '/')
self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files))
self.signal_files_need_update = False
self.load()
def load_csv(self):
global x_axis
# load the csv's for all workers
processes = []
results = Manager().dict()
corrupted_files_idx = []
for idx, signal_file in enumerate(self.signals_files):
if not isinstance(signal_file, SignalsFilesGroup):
processes.append(Process(target=signal_file.load_csv, args=(idx, results)))
processes[-1].start()
[p.join() for p in processes]
# load csv's for SignalsFilesGroup serially for now. TODO: we should later parallelize this as well.
for idx, signal_file in enumerate(self.signals_files):
if isinstance(signal_file, SignalsFilesGroup):
signal_file.load_csv()
for idx, signal_file in enumerate(self.signals_files):
if len(list(results.keys())) > 0:
signal_file.csv, signal_file.last_modified = results[idx]
if not all(option in signal_file.csv.keys() for option in x_axis_options):
print("Warning: {} file seems to be corrupted and does contain the necessary columns "
"and will not be rendered".format(signal_file.filename))
corrupted_files_idx.append(idx)
# remove corrupted worker files
for file_idx in corrupted_files_idx:
del self.signals_files[file_idx]
# get the stats of all the columns
if len(self.signals_files) > 1:
transformed_signals_files = []
subsampling = None
for idx in range(len(self.signals_files)):
transformed_signals_files.append(self.signals_files[idx].csv.copy(deep=True))
# change the index to be the currently selected x axis
transformed_signals_files[-1].index = transformed_signals_files[-1][x_axis[0]]
# remove all duplicate index rows
transformed_signals_files[-1] = transformed_signals_files[-1][~transformed_signals_files[-1].index.duplicated()]
# fill up missing row indices. we are going to take the mean over the group and we want to make sure
# the entire group has some value for every possible index.
num_rows = int(transformed_signals_files[-1].index.values[-1])
transformed_signals_files[-1] = transformed_signals_files[-1].reindex(range(num_rows))
transformed_signals_files[-1].interpolate(inplace=True)
# sub sample the csv to max of 5000 indices (do the same subsampling to all files)
if subsampling is None:
subsampling = max(1, num_rows // 5000)
transformed_signals_files[-1] = transformed_signals_files[-1].iloc[::subsampling, :]
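# e.g. a worker csv with 40,000 rows gets subsampling = 8, thinning it to ~5,000 plotted points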
csv_group = pd.concat([signals_file for signals_file in transformed_signals_files])
columns_to_remove = [s for s in csv_group.columns if '/Stdev' in s] + \
[s for s in csv_group.columns if '/Min' in s] + \
[s for s in csv_group.columns if '/Max' in s]
for col in columns_to_remove:
del csv_group[col]
csv_group = csv_group.groupby(csv_group.index)
self.csv_mean = csv_group.mean()
self.csv_mean.columns = [s + '/Mean' for s in self.csv_mean.columns]
self.csv_stdev = csv_group.std()
self.csv_stdev.columns = [s + '/Stdev' for s in self.csv_stdev.columns]
self.csv_min = csv_group.min()
self.csv_min.columns = [s + '/Min' for s in self.csv_min.columns]
self.csv_max = csv_group.max()
self.csv_max.columns = [s + '/Max' for s in self.csv_max.columns]
# get the indices from the file with the least number of indices and which is not an evaluation worker
file_with_min_indices = transformed_signals_files[0]
for signals_file in transformed_signals_files:
if signals_file.shape[0] < file_with_min_indices.shape[0] and \
'Training reward' in signals_file.keys():
file_with_min_indices = signals_file
self.index_columns = file_with_min_indices[x_axis_options]
# concat the stats and the indices columns
num_rows = file_with_min_indices.shape[0]
self.csv = pd.concat([self.index_columns, self.csv_mean.head(num_rows), self.csv_stdev.head(num_rows),
self.csv_min.head(num_rows), self.csv_max.head(num_rows)], axis=1)
# remove the stat columns for the indices columns
columns_to_remove = [s + '/Mean' for s in x_axis_options] + \
[s + '/Stdev' for s in x_axis_options] + \
[s + '/Min' for s in x_axis_options] + \
[s + '/Max' for s in x_axis_options]
for col in columns_to_remove:
if col in self.csv.keys():
del self.csv[col]
else: # This is a group of a single file
self.csv = self.signals_files[0].csv
# remove NaNs
self.csv.fillna(value=0, inplace=True) # removing this line will make bollinger bands fail
for key in self.csv.keys():
if 'Stdev' in key and 'Evaluation' not in key:
self.csv[key] = self.csv[key].fillna(value=0)
self.signal_files_need_update = True
def reload_data(self):
SignalsFileBase.reload_data(self)
def update_x_axis_index(self):
SignalsFileBase.update_x_axis_index(self)
# update the x axis for the bollinger bands
for signal in self.signals.values():
if signal.has_bollinger_bands:
signal.set_bands_source()
def toggle_y_axis(self, signal_name=None):
for signal in self.signals.values():
if signal.selected:
signal.toggle_axis()
def change_averaging_window(self, new_size, force=False, signals=None):
SignalsFileBase.change_averaging_window(self, new_size, force, signals)
def set_signal_selection(self, signal_name, val):
self.show_files_separately(self.separate_files)
SignalsFileBase.set_signal_selection(self, signal_name, val)
def file_was_modified_on_disk(self):
for signal_file in self.signals_files:
if signal_file.file_was_modified_on_disk():
return True
return False
def show_files_separately(self, val):
self.separate_files = val
# lazy updating of the signals of each of the workers
if self.separate_files and self.signal_files_need_update:
for signal_file in self.signals_files:
signal_file.update_source_and_signals()
self.signal_files_need_update = False
for signal in self.signals.values():
if signal.selected:
if val:
signal.set_dash("4 4")
else:
signal.set_dash("")
for signal_file in self.signals_files:
try:
if val:
signal_file.set_signal_selection(signal.name, signal.selected)
else:
signal_file.set_signal_selection(signal.name, False)
except:
pass

View File

@@ -0,0 +1,219 @@
/* based on https://codepen.io/widmr/pen/tklqx by Anreas Widmer */
.spinner {
font-size: 80px;
width: 1em;
height: 1em;
position: fixed;
left: 40%;
top: 20%;
z-index: 9999;
margin: 100px auto;
border-radius: 50%;
list-style: none;
}
.spinner li {
position: absolute;
width: .2em;
height: .2em;
border-radius: 50%;
}
.spinner li:nth-child(1) {
left: 50%;
top: 0;
margin: 0 0 0 -.1em;
background: #00C176;
-webkit-transform-origin: 50% 250%;
-moz-transform-origin: 50% 250%;
-ms-transform-origin: 50% 250%;
-o-transform-origin: 50% 250%;
transform-origin: 50% 250%;
-webkit-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-moz-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-ms-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
-o-animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
animation:
rota 1.13s linear infinite,
opa 3.67s ease-in-out infinite alternate;
}
.spinner li:nth-child(2) {
top: 50%;
right: 0;
margin: -.1em 0 0 0;
background: #FF003C;
-webkit-transform-origin: -150% 50%;
-moz-transform-origin: -150% 50%;
-ms-transform-origin: -150% 50%;
-o-transform-origin: -150% 50%;
transform-origin: -150% 50%;
-webkit-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-moz-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-ms-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
-o-animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
animation:
rota 1.86s linear infinite,
opa 4.29s ease-in-out infinite alternate;
}
.spinner li:nth-child(3) {
left: 50%;
bottom: 0;
margin: 0 0 0 -.1em;
background: #FABE28;
-webkit-transform-origin: 50% -150%;
-moz-transform-origin: 50% -150%;
-ms-transform-origin: 50% -150%;
-o-transform-origin: 50% -150%;
transform-origin: 50% -150%;
-webkit-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-moz-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-ms-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
-o-animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
animation:
rota 1.45s linear infinite,
opa 5.12s ease-in-out infinite alternate;
}
.spinner li:nth-child(4) {
top: 50%;
left: 0;
margin: -.1em 0 0 0;
background: #88C100;
-webkit-transform-origin: 250% 50%;
-moz-transform-origin: 250% 50%;
-ms-transform-origin: 250% 50%;
-o-transform-origin: 250% 50%;
transform-origin: 250% 50%;
-webkit-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-moz-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-ms-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
-o-animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
animation:
rota 1.72s linear infinite,
opa 5.25s ease-in-out infinite alternate;
}
@-webkit-keyframes rota {
to { -webkit-transform: rotate(360deg); }
}
@-moz-keyframes rota {
to { -moz-transform: rotate(360deg); }
}
@-ms-keyframes rota {
to { -ms-transform: rotate(360deg); }
}
@-o-keyframes rota {
to { -o-transform: rotate(360deg); }
}
@keyframes rota {
to { transform: rotate(360deg); }
}
@-webkit-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-moz-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-ms-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@-o-keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}
@keyframes opa {
12.0% { opacity: 0.80; }
19.5% { opacity: 0.88; }
37.2% { opacity: 0.64; }
40.5% { opacity: 0.52; }
52.7% { opacity: 0.69; }
60.2% { opacity: 0.60; }
66.6% { opacity: 0.52; }
70.0% { opacity: 0.63; }
79.9% { opacity: 0.60; }
84.2% { opacity: 0.75; }
91.0% { opacity: 0.87; }
}

View File

Binary file not shown.

Binary file not shown.

77
rl_coach/debug_utils.py Normal file
View File

@@ -0,0 +1,77 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import math
import matplotlib.pyplot as plt
import numpy as np
from rl_coach.filters.observation.observation_stacking_filter import LazyStack
def show_observation_stack(stack, channels_last=True, show=True, force_num_rows=None, row_to_update=0):
if isinstance(stack, LazyStack):
stack = np.array(stack)
if isinstance(stack, list): # is list
stack_size = len(stack)
elif len(stack.shape) == 3:
stack_size = stack.shape[0] # is numpy array
elif len(stack.shape) == 4:
stack_size = stack.shape[1] # ignore batch dimension
stack = stack[0]
else:
raise ValueError("The observation stack must be a list, a numpy array or a LazyStack object")
if channels_last:
stack = np.transpose(stack, (2, 0, 1))
stack_size = stack.shape[0]
max_cols = 10
if force_num_rows:
rows = force_num_rows
else:
rows = math.ceil(stack_size / max_cols)
cols = max_cols if stack_size > max_cols else stack_size
for i in range(stack_size):
plt.subplot(rows, cols, row_to_update * cols + i + 1)
plt.imshow(stack[i], cmap='gray')
if show:
plt.show()
def show_diff_between_two_observations(observation1, observation2):
plt.imshow(observation1 - observation2, cmap='gray')
plt.show()
def plot_grayscale_observation(observation):
plt.imshow(observation, cmap='gray')
plt.show()
def plot_episode_states(episode_transitions, state_variable: str='state', observation_index_in_stack: int=0):
observations = []
for transition in episode_transitions:
observations.append(np.array(getattr(transition, state_variable)['observation'])[..., observation_index_in_stack])
show_observation_stack(observations, False)
def plot_list_of_observation_stacks(observation_stacks):
for idx, stack in enumerate(observation_stacks):
show_observation_stack(stack['observation'], True, False,
force_num_rows=len(observation_stacks), row_to_update=idx)
plt.show()
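# --- illustrative usage sketch (not part of the original file) ---
# Visualize a stack of four synthetic grayscale frames; the data below is made up
# purely for demonstration.
if __name__ == '__main__':
    dummy_stack = [np.random.rand(84, 84) for _ in range(4)]
    show_observation_stack(dummy_stack, channels_last=False)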

View File

@@ -0,0 +1,112 @@
; Example of settings file for CARLA.
;
; This file can be loaded with the Python client to be sent to the server. It
; defines the parameters to be used when requesting a new episode.
;
; Note that server specific variables are only loaded when launching the
; simulator. Use it with `./CarlaUE4.sh -carla-settings=Path/To/This/File`.
[CARLA/Server]
; If set to false, a mock controller will be used instead of waiting for a real
; client to connect. (Server only)
UseNetworking=false
; Ports to use for the server-client communication. This can be overridden by
; the command-line switch `-world-port=N`, write and read ports will be set to
; N+1 and N+2 respectively. (Server only)
WorldPort=2000
; Time-out in milliseconds for the networking operations. (Server only)
ServerTimeOut=100000000000
; In synchronous mode, CARLA waits every frame until the control from the client
; is received.
SynchronousMode=true
; Send info about every non-player agent in the scene every frame, the
; information is attached to the measurements message. This includes other
; vehicles, pedestrians and traffic signs. Disabled by default to improve
; performance.
SendNonPlayerAgentsInfo=false
[CARLA/QualitySettings]
; Quality level of the graphics, a lower level makes the simulation run
; considerably faster. Available: Low or Epic.
QualityLevel=Low
[CARLA/LevelSettings]
; Path of the vehicle class to be used for the player. Leave empty for default.
; Paths follow the pattern "/Game/Blueprints/Vehicles/Mustang/Mustang.Mustang_C"
PlayerVehicle=
; Number of non-player vehicles to be spawned into the level.
NumberOfVehicles=15
; Number of non-player pedestrians to be spawned into the level.
NumberOfPedestrians=30
; Index of the weather/lighting presets to use. If negative, the default presets
; of the map will be used.
WeatherId=1
; Seeds for the pseudo-random number generators.
SeedVehicles=123456789
SeedPedestrians=123456789
[CARLA/Sensor]
; Names of the sensors to be attached to the player, comma-separated, each of
; them should be defined in its own subsection.
; Uncomment next line to add a camera called FrontCamera to the vehicle
Sensors=FrontCamera
; or uncomment next line to add a camera and a Lidar
; Sensors=FrontCamera,MyLidar
; or uncomment next line to add a regular camera and a depth camera
; Sensors=FrontCamera,FrontCamera/Depth
; Now, every camera we added needs to be defined in its own subsection.
[CARLA/Sensor/FrontCamera]
; Type of the sensor. The available types are:
; * CAMERA A scene capture camera.
; * LIDAR_RAY_CAST A Lidar implementation based on ray-casting.
SensorType=CAMERA
; Post-processing effect to be applied to this camera. Valid values:
; * None No effects applied.
; * SceneFinal Post-processing present at scene (bloom, fog, etc).
; * Depth Depth map ground-truth only.
; * SemanticSegmentation Semantic segmentation ground-truth only.
PostProcessing=SceneFinal
; Size of the captured image in pixels.
ImageSizeX=360
ImageSizeY=256
; Camera (horizontal) field of view in degrees.
FOV=90
; Position of the camera relative to the car in meters.
PositionX=0.20
PositionY=0
PositionZ=1.30
; Rotation of the camera relative to the car in degrees.
RotationPitch=8
RotationRoll=0
RotationYaw=0
[CARLA/Sensor/FrontCamera/Depth]
; The sensor can be defined in a subsection of FrontCamera so it inherits the
; values in FrontCamera. This adds a camera similar to FrontCamera but generating
; depth map images instead.
PostProcessing=Depth
[CARLA/Sensor/MyLidar]
SensorType=LIDAR_RAY_CAST
; Number of lasers.
Channels=32
; Measure distance in meters.
Range=50.0
; Points generated by all lasers per second.
PointsPerSecond=100000
; Lidar rotation frequency.
RotationFrequency=10
; Upper and lower laser angles, positive values mean above the horizontal line.
UpperFOVLimit=10
LowerFOVLimit=-30
; Position and rotation relative to the vehicle.
PositionX=0
PositionY=0
PositionZ=1.40
RotationPitch=0
RotationYaw=0
RotationRoll=0
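For reference, here is a minimal sketch of how a settings file like this is consumed on the client side, mirroring what the CARLA environment wrapper later in this commit does. The file path, host, port and timeout are illustrative, and the API is the CARLA 0.8-era Python client that the wrapper imports.
```python
from carla.client import CarlaClient

# read the raw .ini text; the client forwards it to the server when loading settings
with open('CarlaSettings.ini', 'r') as fp:
    settings = fp.read()

client = CarlaClient('localhost', 2000, timeout=100)  # port must match WorldPort above
client.connect()
scene = client.load_settings(settings)   # returns the scene description, incl. player_start_spots
client.start_episode(0)                  # start at the first available player start spot
```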

View File

@@ -0,0 +1,19 @@
A custom input filter implementation should look like this (the import paths below follow the `rl_coach` package layout used elsewhere in this release):
```python
from rl_coach.core_types import EnvResponse
from rl_coach.filters.filter import InputFilter
from rl_coach.spaces import ObservationSpace, RewardSpace

class CustomFilter(InputFilter):
def __init__(self):
...
def _filter(self, env_response: EnvResponse) -> EnvResponse:
...
def _get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace:
...
def _get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace:
...
def _validate_input_observation_space(self, input_observation_space: ObservationSpace):
...
def _reset(self):
...
```
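For context, a minimal sketch of how an input filter is wired into an environment in this release, mirroring the CARLA, Doom and Atari environments in this commit; the observation name `'observation'` and the stacking depth are illustrative:
```python
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter

# chain per-observation filters; each stage also transforms the observation space definition
MyInputFilter = InputFilter(is_a_reference_filter=True)
MyInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
MyInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))

# an EnvironmentParameters subclass can then expose it via its default_input_filter attribute
```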

View File

@@ -0,0 +1,16 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,357 @@
import random
import sys
from os import path, environ
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
try:
if 'CARLA_ROOT' in environ:
sys.path.append(path.join(environ.get('CARLA_ROOT'), 'PythonClient'))
from carla.client import CarlaClient
from carla.settings import CarlaSettings
from carla.tcp import TCPConnectionError
from carla.sensor import Camera
from carla.client import VehicleControl
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("CARLA")
import logging
import subprocess
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, StateSpace, \
VectorObservationSpace
from rl_coach.utils import get_open_port, force_list
# needed for the human-control discretization below; the module path is assumed to follow
# the rl_coach.filters.action convention used by the other environments in this commit
from rl_coach.filters.action.partial_discrete_action_space_map import PartialDiscreteActionSpaceMap
from enum import Enum
import os
import signal
from typing import List, Union
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.filters.filter import InputFilter, NoOutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
import numpy as np
# enum of the available levels and their path
class CarlaLevel(Enum):
TOWN1 = "/Game/Maps/Town01"
TOWN2 = "/Game/Maps/Town02"
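# keyboard key codes used for human control; tuples allow pressing several keys at once (e.g. gas + turn)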
key_map = {
'BRAKE': (274,), # down arrow
'GAS': (273,), # up arrow
'TURN_LEFT': (276,), # left arrow
'TURN_RIGHT': (275,), # right arrow
'GAS_AND_TURN_LEFT': (273, 276),
'GAS_AND_TURN_RIGHT': (273, 275),
'BRAKE_AND_TURN_LEFT': (274, 276),
'BRAKE_AND_TURN_RIGHT': (274, 275),
}
CarlaInputFilter = InputFilter(is_a_reference_filter=True)
CarlaInputFilter.add_observation_filter('forward_camera', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([128, 180, 3]),
high=255)))
CarlaInputFilter.add_observation_filter('forward_camera', 'to_grayscale', ObservationRGBToYFilter())
CarlaInputFilter.add_observation_filter('forward_camera', 'to_uint8', ObservationToUInt8Filter(0, 255))
CarlaInputFilter.add_observation_filter('forward_camera', 'stacking', ObservationStackingFilter(4))
CarlaOutputFilter = NoOutputFilter()
class CameraTypes(Enum):
FRONT = "forward_camera"
LEFT = "left_camera"
RIGHT = "right_camera"
SEGMENTATION = "segmentation"
DEPTH = "depth"
LIDAR = "lidar"
class CarlaEnvironmentParameters(EnvironmentParameters):
class Quality(Enum):
LOW = "Low"
EPIC = "Epic"
def __init__(self):
super().__init__()
self.frame_skip = 3 # the frame skip affects the fps of the server directly. fps = 30 / frameskip
self.server_height = 512
self.server_width = 720
self.camera_height = 128
self.camera_width = 180
self.config = None #'environments/CarlaSettings.ini' # TODO: remove the config to prevent confusion
self.level = 'town1'
self.quality = self.Quality.LOW
self.cameras = [CameraTypes.FRONT]
self.weather_id = [1]
self.verbose = True
self.episode_max_time = 100000 # milliseconds for each episode
self.allow_braking = False
self.default_input_filter = CarlaInputFilter
self.default_output_filter = CarlaOutputFilter
@property
def path(self):
return 'rl_coach.environments.carla_environment:CarlaEnvironment'
class CarlaEnvironment(Environment):
def __init__(self, level: LevelSelection,
seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float],
visualization_parameters: VisualizationParameters,
server_height: int, server_width: int, camera_height: int, camera_width: int,
verbose: bool, config: str, episode_max_time: int,
allow_braking: bool, quality: CarlaEnvironmentParameters.Quality,
cameras: List[CameraTypes], weather_id: List[int], **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
# server configuration
self.server_height = server_height
self.server_width = server_width
self.port = get_open_port()
self.host = 'localhost'
self.map = self.env_id
# client configuration
self.verbose = verbose
self.quality = quality
self.cameras = cameras
self.weather_id = weather_id
self.episode_max_time = episode_max_time
self.allow_braking = allow_braking
self.camera_width = camera_width
self.camera_height = camera_height
# state space
self.state_space = StateSpace({
"measurements": VectorObservationSpace(4, measurements_names=["forward_speed", "x", "y", "z"])
})
for camera in self.cameras:
self.state_space[camera.value] = ImageObservationSpace(
shape=np.array([self.camera_height, self.camera_width, 3]),
high=255)
# setup server settings
self.config = config
if self.config:
# load settings from file
with open(self.config, 'r') as fp:
self.settings = fp.read()
else:
# hard coded settings
self.settings = CarlaSettings()
self.settings.set(
SynchronousMode=True,
SendNonPlayerAgentsInfo=False,
NumberOfVehicles=15,
NumberOfPedestrians=30,
WeatherId=random.choice(force_list(self.weather_id)),
QualityLevel=self.quality.value)
self.settings.randomize_seeds()
self.settings = self._add_cameras(self.settings, self.cameras, self.camera_width, self.camera_height)
# open the server
self.server = self._open_server()
logging.disable(40)
# open the client
self.game = CarlaClient(self.host, self.port, timeout=99999999)
self.game.connect()
scene = self.game.load_settings(self.settings)
# get available start positions
positions = scene.player_start_spots
self.num_pos = len(positions)
self.iterator_start_positions = 0
# action space
self.action_space = BoxActionSpace(shape=2, low=np.array([-1, -1]), high=np.array([1, 1]))
# human control
if self.human_control:
# convert continuous action space to discrete
self.steering_strength = 0.5
self.gas_strength = 1.0
self.brake_strength = 0.5
self.action_space = PartialDiscreteActionSpaceMap(
target_actions=[[0., 0.],
[0., -self.steering_strength],
[0., self.steering_strength],
[self.gas_strength, 0.],
[-self.brake_strength, 0],
[self.gas_strength, -self.steering_strength],
[self.gas_strength, self.steering_strength],
[self.brake_strength, -self.steering_strength],
[self.brake_strength, self.steering_strength]],
target_action_space=self.action_space,
descriptions=['NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT']
)
# map keyboard keys to actions
for idx, action in enumerate(self.action_space.descriptions):
for key in key_map.keys():
if action == key:
self.key_to_action[key_map[key]] = idx
self.num_speedup_steps = 30
# measurements
self.autopilot = None
# env initialization
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
self.renderer.create_screen(image.shape[1], image.shape[0])
def _add_cameras(self, settings, cameras, camera_width, camera_height):
# add a front facing camera
if CameraTypes.FRONT in cameras:
camera = Camera(CameraTypes.FRONT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 0, 0)
settings.add_sensor(camera)
# add a left facing camera
if CameraTypes.LEFT in cameras:
camera = Camera(CameraTypes.LEFT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, -30, 0)
settings.add_sensor(camera)
# add a right facing camera
if CameraTypes.RIGHT in cameras:
camera = Camera(CameraTypes.RIGHT.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
settings.add_sensor(camera)
# add a front facing depth camera
if CameraTypes.DEPTH in cameras:
camera = Camera(CameraTypes.DEPTH.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
camera.PostProcessing = 'Depth'
settings.add_sensor(camera)
# add a front facing semantic segmentation camera
if CameraTypes.SEGMENTATION in cameras:
camera = Camera(CameraTypes.SEGMENTATION.value)
camera.set_image_size(camera_width, camera_height)
camera.set_position(0.2, 0, 1.3)
camera.set_rotation(8, 30, 0)
camera.PostProcessing = 'SemanticSegmentation'
settings.add_sensor(camera)
return settings
def _open_server(self):
# TODO: get experiment path
log_path = path.join('./logs/', "CARLA_LOG_{}.txt".format(self.port))
with open(log_path, "wb") as out:
cmd = [path.join(environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map,
"-benchmark", "-carla-server", "-fps={}".format(30 / self.frame_skip),
"-world-port={}".format(self.port),
"-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height),
"-carla-no-hud"]
if self.config:
cmd.append("-carla-settings={}".format(self.config))
p = subprocess.Popen(cmd, stdout=out, stderr=out)
return p
def _close_server(self):
os.killpg(os.getpgid(self.server.pid), signal.SIGKILL)
def _update_state(self):
# get measurements and observations
measurements = []
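# keep calling read_data() until it returns a proper measurements object rather than a list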
while type(measurements) == list:
measurements, sensor_data = self.game.read_data()
self.state = {}
for camera in self.cameras:
self.state[camera.value] = sensor_data[camera.value].data
self.location = [measurements.player_measurements.transform.location.x,
measurements.player_measurements.transform.location.y,
measurements.player_measurements.transform.location.z]
is_collision = measurements.player_measurements.collision_vehicles != 0 \
or measurements.player_measurements.collision_pedestrians != 0 \
or measurements.player_measurements.collision_other != 0
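# reward shaping: capped forward speed minus penalties for leaving the lane, driving off-road, collisions and hard steering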
speed_reward = measurements.player_measurements.forward_speed - 1
if speed_reward > 30.:
speed_reward = 30.
self.reward = speed_reward \
- (measurements.player_measurements.intersection_otherlane * 5) \
- (measurements.player_measurements.intersection_offroad * 5) \
- is_collision * 100 \
- np.abs(self.control.steer) * 10
# update measurements
self.measurements = [measurements.player_measurements.forward_speed] + self.location
self.autopilot = measurements.player_measurements.autopilot_control
# action_p = ['%.2f' % member for member in [self.control.throttle, self.control.steer]]
# screen.success('REWARD: %.2f, ACTIONS: %s' % (self.reward, action_p))
if (measurements.game_timestamp >= self.episode_max_time) or is_collision:
# screen.success('EPISODE IS DONE. GameTime: {}, Collision: {}'.format(str(measurements.game_timestamp),
# str(is_collision)))
self.done = True
self.state['measurements'] = self.measurements
def _take_action(self, action):
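# action[0] in [-1, 1] maps to throttle (positive part) / brake (negative part); action[1] is the steering command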
self.control = VehicleControl()
self.control.throttle = np.clip(action[0], 0, 1)
self.control.steer = np.clip(action[1], -1, 1)
self.control.brake = np.abs(np.clip(action[0], -1, 0))
if not self.allow_braking:
self.control.brake = 0
self.control.hand_brake = False
self.control.reverse = False
self.game.send_control(self.control)
def _restart_environment_episode(self, force_environment_reset=False):
self.iterator_start_positions += 1
if self.iterator_start_positions >= self.num_pos:
self.iterator_start_positions = 0
try:
self.game.start_episode(self.iterator_start_positions)
except:
self.game.connect()
self.game.start_episode(self.iterator_start_positions)
# start the game with some initial speed
for i in range(self.num_speedup_steps):
self._take_action([1.0, 0])
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
image = [self.state[camera.value] for camera in self.cameras]
image = np.vstack(image)
return image
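For reference, a minimal, hypothetical sketch of configuring this environment through its parameters class; the framework's preset machinery (not shown here) is what actually instantiates the environment from these parameters:
```python
from rl_coach.environments.carla_environment import CarlaEnvironmentParameters, CameraTypes

env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'                                   # see CarlaLevel for the available maps
env_params.cameras = [CameraTypes.FRONT, CameraTypes.DEPTH]  # each camera becomes an entry in the state space
env_params.allow_braking = True                              # otherwise the brake component of the action is zeroed
env_params.quality = CarlaEnvironmentParameters.Quality.EPIC
```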

View File

@@ -0,0 +1,162 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from enum import Enum
from typing import Union
import numpy as np
try:
from dm_control import suite
from dm_control.suite.wrappers import pixels
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("DeepMind Control Suite")
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, VectorObservationSpace, StateSpace
class ObservationType(Enum):
Measurements = 1
Image = 2
Image_and_Measurements = 3
# Parameters
class ControlSuiteEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.observation_type = ObservationType.Measurements
self.default_input_filter = ControlSuiteInputFilter
self.default_output_filter = ControlSuiteOutputFilter
@property
def path(self):
return 'rl_coach.environments.control_suite_environment:ControlSuiteEnvironment'
"""
ControlSuite Environment Components
"""
ControlSuiteInputFilter = NoInputFilter()
ControlSuiteOutputFilter = NoOutputFilter()
control_suite_envs = {':'.join(env): ':'.join(env) for env in suite.BENCHMARKING}
# Environment
class ControlSuiteEnvironment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
seed: Union[None, int]=None, human_control: bool=False,
observation_type: ObservationType=ObservationType.Measurements,
custom_reward_threshold: Union[int, float]=None, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.observation_type = observation_type
# load and initialize environment
domain_name, task_name = self.env_id.split(":")
self.env = suite.load(domain_name=domain_name, task_name=task_name)
if observation_type != ObservationType.Measurements:
self.env = pixels.Wrapper(self.env, pixels_only=observation_type == ObservationType.Image)
# seed
if self.seed is not None:
np.random.seed(self.seed)
random.seed(self.seed)
self.state_space = StateSpace({})
# image observations
if observation_type != ObservationType.Measurements:
self.state_space['pixels'] = ImageObservationSpace(shape=self.env.observation_spec()['pixels'].shape,
high=255)
# measurements observations
if observation_type != ObservationType.Image:
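# flatten every scalar and 1-D entry of the observation spec into a single measurements vector, keeping per-element names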
measurements_space_size = 0
measurements_names = []
for observation_space_name, observation_space in self.env.observation_spec().items():
if len(observation_space.shape) == 0:
measurements_space_size += 1
measurements_names.append(observation_space_name)
elif len(observation_space.shape) == 1:
measurements_space_size += observation_space.shape[0]
measurements_names.extend(["{}_{}".format(observation_space_name, i) for i in
range(observation_space.shape[0])])
self.state_space['measurements'] = VectorObservationSpace(shape=measurements_space_size,
measurements_names=measurements_names)
# actions
self.action_space = BoxActionSpace(
shape=self.env.action_spec().shape[0],
low=self.env.action_spec().minimum,
high=self.env.action_spec().maximum
)
# initialize the state by getting a new state from the environment
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
scale = 1
if self.human_control:
scale = 2
if not self.native_rendering:
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
def _update_state(self):
self.state = {}
if self.observation_type != ObservationType.Measurements:
self.pixels = self.last_result.observation['pixels']
self.state['pixels'] = self.pixels
if self.observation_type != ObservationType.Image:
self.measurements = np.array([])
for sub_observation in self.last_result.observation.values():
if isinstance(sub_observation, np.ndarray) and len(sub_observation.shape) == 1:
self.measurements = np.concatenate((self.measurements, sub_observation))
else:
self.measurements = np.concatenate((self.measurements, np.array([sub_observation])))
self.state['measurements'] = self.measurements
self.reward = self.last_result.reward if self.last_result.reward is not None else 0
self.done = self.last_result.last()
def _take_action(self, action):
if type(self.action_space) == BoxActionSpace:
action = self.action_space.clip_action_to_space(action)
self.last_result = self.env.step(action)
def _restart_environment_episode(self, force_environment_reset=False):
self.last_result = self.env.reset()
def _render(self):
pass
def get_rendered_image(self):
return self.env.physics.render(camera_id=0)
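Similarly, a minimal sketch of selecting a dm_control level through this parameters class; `'cartpole:swingup'` is an illustrative (domain, task) pair from `suite.BENCHMARKING`, joined with `':'` as the level dictionary above expects:
```python
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironmentParameters, ObservationType

env_params = ControlSuiteEnvironmentParameters()
env_params.level = 'cartpole:swingup'   # '<domain>:<task>' as in control_suite_envs
env_params.observation_type = ObservationType.Image_and_Measurements
```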

View File

@@ -0,0 +1,39 @@
# Lines starting with # are treated as comments (or with whitespaces+#).
# It doesn't matter if you use capital letters or not.
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
doom_scenario_path = D2_navigation.wad
doom_map = map01
# Rewards
# Each step is good for you!
living_reward = 1
# And death is not!
death_penalty = 0
# Rendering options
screen_resolution = RES_160X120
screen_format = GRAY8
render_hud = false
render_crosshair = false
render_weapon = false
render_decals = false
render_particles = false
window_visible = false
# make episodes finish after 2100 actions (tics)
episode_timeout = 2100
# Available buttons
available_buttons =
{
TURN_LEFT
TURN_RIGHT
MOVE_FORWARD
}
# Game variables that will be in the state
available_game_variables = { HEALTH }
mode = PLAYER
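For reference, a minimal sketch of loading this scenario directly with ViZDoom, mirroring what the DoomEnvironment wrapper later in this commit does; the config path is illustrative (the wrapper resolves it under its local scenarios directory):
```python
import vizdoom

game = vizdoom.DoomGame()
game.load_config('D2_navigation.cfg')   # illustrative path
game.set_window_visible(False)
game.init()
state = game.get_state()                # screen_buffer plus the HEALTH game variable declared above
```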

Binary file not shown.

View File

@@ -0,0 +1,44 @@
# Lines starting with # are treated as comments (or with whitespaces+#).
# It doesn't matter if you use capital letters or not.
# It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout.
# modify these to point to your vizdoom binary and freedoom2.wad
doom_scenario_path = D3_battle.wad
doom_map = map01
# Rewards
living_reward = 0
death_penalty = 0
# Rendering options
screen_resolution = RES_320X240
screen_format = CRCGCB
render_hud = false
render_crosshair = true
render_weapon = true
render_decals = false
render_particles = false
window_visible = false
# make episodes finish after 2100 actions (tics)
episode_timeout = 2100
# Available buttons
available_buttons =
{
MOVE_FORWARD
MOVE_BACKWARD
MOVE_RIGHT
MOVE_LEFT
TURN_LEFT
TURN_RIGHT
ATTACK
SPEED
}
# Game variables that will be in the state
available_game_variables = {AMMO2 HEALTH USER2}
mode = PLAYER
doom_skill = 2

Binary file not shown.

View File

@@ -0,0 +1,229 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
try:
import vizdoom
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("ViZDoom")
import os
from enum import Enum
from os import path, environ
from typing import Union, List
import numpy as np
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.filters.action.full_discrete_action_space_map import FullDiscreteActionSpaceMap
from rl_coach.filters.filter import InputFilter, OutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.spaces import MultiSelectActionSpace, ImageObservationSpace, \
VectorObservationSpace, StateSpace
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
# enum of the available levels and their path
class DoomLevel(Enum):
BASIC = "basic.cfg"
DEFEND = "defend_the_center.cfg"
DEATHMATCH = "deathmatch.cfg"
MY_WAY_HOME = "my_way_home.cfg"
TAKE_COVER = "take_cover.cfg"
HEALTH_GATHERING = "health_gathering.cfg"
HEALTH_GATHERING_SUPREME_COACH_LOCAL = "D2_navigation.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
DEFEND_THE_LINE = "defend_the_line.cfg"
DEADLY_CORRIDOR = "deadly_corridor.cfg"
BATTLE_COACH_LOCAL = "D3_battle.cfg" # from https://github.com/IntelVCL/DirectFuturePrediction/tree/master/maps
key_map = {
'NO-OP': 96, # `
'ATTACK': 13, # enter
'CROUCH': 306, # ctrl
'DROP_SELECTED_ITEM': ord("t"),
'DROP_SELECTED_WEAPON': ord("t"),
'JUMP': 32, # spacebar
'LAND': ord("l"),
'LOOK_DOWN': 274, # down arrow
'LOOK_UP': 273, # up arrow
'MOVE_BACKWARD': ord("s"),
'MOVE_DOWN': ord("s"),
'MOVE_FORWARD': ord("w"),
'MOVE_LEFT': 276,
'MOVE_RIGHT': 275,
'MOVE_UP': ord("w"),
'RELOAD': ord("r"),
'SELECT_NEXT_WEAPON': ord("q"),
'SELECT_PREV_WEAPON': ord("e"),
'SELECT_WEAPON0': ord("0"),
'SELECT_WEAPON1': ord("1"),
'SELECT_WEAPON2': ord("2"),
'SELECT_WEAPON3': ord("3"),
'SELECT_WEAPON4': ord("4"),
'SELECT_WEAPON5': ord("5"),
'SELECT_WEAPON6': ord("6"),
'SELECT_WEAPON7': ord("7"),
'SELECT_WEAPON8': ord("8"),
'SELECT_WEAPON9': ord("9"),
'SPEED': 304, # shift
'STRAFE': 9, # tab
'TURN180': ord("u"),
'TURN_LEFT': ord("a"), # left arrow
'TURN_RIGHT': ord("d"), # right arrow
'USE': ord("f"),
}
DoomInputFilter = InputFilter(is_a_reference_filter=True)
DoomInputFilter.add_observation_filter('observation', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([60, 76, 3]),
high=255)))
DoomInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
DoomInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
DoomInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(3))
DoomOutputFilter = OutputFilter(is_a_reference_filter=True)
DoomOutputFilter.add_action_filter('to_discrete', FullDiscreteActionSpaceMap())
class DoomEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.default_input_filter = DoomInputFilter
self.default_output_filter = DoomOutputFilter
self.cameras = [DoomEnvironment.CameraTypes.OBSERVATION]
@property
def path(self):
return 'rl_coach.environments.doom_environment:DoomEnvironment'
class DoomEnvironment(Environment):
class CameraTypes(Enum):
OBSERVATION = ("observation", "screen_buffer")
DEPTH = ("depth", "depth_buffer")
LABELS = ("labels", "labels_buffer")
MAP = ("map", "automap_buffer")
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
cameras: List[CameraTypes], **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.cameras = cameras
# load the emulator with the required level
self.level = DoomLevel[level.upper()]
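# levels marked COACH_LOCAL ship with Coach next to this file; all others are resolved under $VIZDOOM_ROOT/scenarios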
local_scenarios_path = path.join(os.path.dirname(os.path.realpath(__file__)), 'doom')
self.scenarios_dir = local_scenarios_path if 'COACH_LOCAL' in level \
else path.join(environ.get('VIZDOOM_ROOT'), 'scenarios')
self.game = vizdoom.DoomGame()
self.game.load_config(path.join(self.scenarios_dir, self.level.value))
self.game.set_window_visible(False)
self.game.add_game_args("+vid_forcesurface 1")
self.wait_for_explicit_human_action = True
if self.human_control:
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_640X480)
elif self.is_rendered:
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_320X240)
else:
# lower resolution since we actually take only 76x60 and we don't need to render
self.game.set_screen_resolution(vizdoom.ScreenResolution.RES_160X120)
self.game.set_render_hud(False)
self.game.set_render_crosshair(False)
self.game.set_render_decals(False)
self.game.set_render_particles(False)
for camera in self.cameras:
if hasattr(self.game, 'set_{}_enabled'.format(camera.value[1])):
getattr(self.game, 'set_{}_enabled'.format(camera.value[1]))(True)
self.game.init()
# actions
actions_description = ['NO-OP']
actions_description += [str(action).split(".")[1] for action in self.game.get_available_buttons()]
actions_description = actions_description[::-1]
self.action_space = MultiSelectActionSpace(self.game.get_available_buttons_size(),
max_simultaneous_selected_actions=1,
descriptions=actions_description,
allow_no_action_to_be_selected=True)
# human control
if self.human_control:
# TODO: add this to the action space
# map keyboard keys to actions
for idx, action in enumerate(self.action_space.descriptions):
if action in key_map.keys():
self.key_to_action[(key_map[action],)] = idx
# states
self.state_space = StateSpace({
"measurements": VectorObservationSpace(self.game.get_state().game_variables.shape[0],
measurements_names=[str(m) for m in
self.game.get_available_game_variables()])
})
for camera in self.cameras:
self.state_space[camera.value[0]] = ImageObservationSpace(
shape=np.array([self.game.get_screen_height(), self.game.get_screen_width(), 3]),
high=255)
# seed
if seed is not None:
self.game.set_seed(seed)
self.reset_internal_state()
# render
if self.is_rendered:
image = self.get_rendered_image()
self.renderer.create_screen(image.shape[1], image.shape[0])
def _update_state(self):
# extract all data from the current state
state = self.game.get_state()
if state is not None and state.screen_buffer is not None:
self.measurements = state.game_variables
self.state = {'measurements': self.measurements}
for camera in self.cameras:
observation = getattr(state, camera.value[1])
if len(observation.shape) == 3:
self.state[camera.value[0]] = np.transpose(observation, (1, 2, 0))
elif len(observation.shape) == 2:
self.state[camera.value[0]] = np.repeat(np.expand_dims(observation, -1), 3, axis=-1)
self.reward = self.game.get_last_reward()
self.done = self.game.is_episode_finished()
def _take_action(self, action):
self.game.make_action(list(action), self.frame_skip)
def _restart_environment_episode(self, force_environment_reset=False):
self.game.new_episode()
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
image = [self.state[camera.value[0]] for camera in self.cameras]
image = np.vstack(image)
return image

View File

@@ -0,0 +1,540 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import operator
import time
from collections import OrderedDict
from typing import Union, List, Tuple, Dict
import numpy as np
from rl_coach.base_parameters import Parameters
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.core_types import GoalType, ActionType, EnvResponse, RunPhase
from rl_coach.renderer import Renderer
from rl_coach.spaces import ActionSpace, ObservationSpace, DiscreteActionSpace, RewardSpace, StateSpace
from rl_coach.utils import squeeze_list, force_list
from rl_coach import logger
from rl_coach.environments.environment_interface import EnvironmentInterface
from rl_coach.logger import screen
class LevelSelection(object):
def __init__(self, level: str):
self.selected_level = level
def select(self, level: str):
self.selected_level = level
def __str__(self):
if self.selected_level is None:
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
"or change the level in the preset.", crash=True)
return self.selected_level
class SingleLevelSelection(LevelSelection):
def __init__(self, levels: Union[str, List[str], Dict[str, str]]):
super().__init__(None)
self.levels = levels
if isinstance(levels, list):
self.levels = {level: level for level in levels}
if isinstance(levels, str):
self.levels = {levels: levels}
def __str__(self):
if self.selected_level is None:
logger.screen.error("No level has been selected. Please select a level using the -lvl command line flag, "
"or change the level in the preset. \nThe available levels are: \n{}"
.format(', '.join(self.levels.keys())), crash=True)
if self.selected_level not in self.levels.keys():
logger.screen.error("The selected level ({}) is not part of the available levels ({})"
.format(self.selected_level, ', '.join(self.levels.keys())), crash=True)
return self.levels[self.selected_level]
# class SingleLevelPerPhase(LevelSelection):
# def __init__(self, levels: Dict[RunPhase, str]):
# super().__init__(None)
# self.levels = levels
#
# def __str__(self):
# super().__str__()
# if self.selected_level not in self.levels.keys():
# logger.screen.error("The selected level ({}) is not part of the available levels ({})"
# .format(self.selected_level, self.levels.keys()), crash=True)
# return self.levels[self.selected_level]
class CustomWrapper(object):
def __init__(self, environment):
super().__init__()
self.environment = environment
def __getattr__(self, attr):
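# delegate unknown attribute lookups to the wrapped environment, falling back to False if it is missing there as well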
if attr in self.__dict__:
return self.__dict__[attr]
else:
return getattr(self.environment, attr, False)
class EnvironmentParameters(Parameters):
def __init__(self):
super().__init__()
self.level = None
self.frame_skip = 4
self.seed = None
self.human_control = False
self.custom_reward_threshold = None
self.default_input_filter = None
self.default_output_filter = None
@property
def path(self):
return 'rl_coach.environments.environment:Environment'
class Environment(EnvironmentInterface):
def __init__(self, level: LevelSelection, seed: int, frame_skip: int, human_control: bool,
custom_reward_threshold: Union[int, float], visualization_parameters: VisualizationParameters,
**kwargs):
"""
:param level: The environment level. Each environment can have multiple levels
:param seed: a seed for the random number generator of the environment
:param frame_skip: number of frames to skip (while repeating the same action) between each two agent directives
:param human_control: human should control the environment
:param visualization_parameters: a blob of parameters used for visualization of the environment
:param **kwargs: as the class is instantiated by EnvironmentParameters, this is used to support having
additional arguments which will be ignored by this class, but might be used by others
"""
super().__init__()
# env initialization
self.game = []
self.state = {}
self.observation = None
self.goal = None
self.reward = 0
self.done = False
self.info = {}
self._last_env_response = None
self.last_action = 0
self.episode_idx = 0
self.total_steps_counter = 0
self.current_episode_steps_counter = 0
self.last_episode_time = time.time()
self.key_to_action = {}
self.last_episode_images = []
# rewards
self.total_reward_in_current_episode = 0
self.max_reward_achieved = -np.inf
self.reward_success_threshold = custom_reward_threshold
# spaces
self.state_space = self._state_space = None
self.goal_space = self._goal_space = None
self.action_space = self._action_space = None
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold) # TODO: add a getter and setter
self.env_id = str(level)
self.seed = seed
self.frame_skip = frame_skip
# human interaction and visualization
self.human_control = human_control
self.wait_for_explicit_human_action = False
self.is_rendered = visualization_parameters.render or self.human_control
self.native_rendering = visualization_parameters.native_rendering or self.human_control
self.visualization_parameters = visualization_parameters
if not self.native_rendering:
self.renderer = Renderer()
@property
def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
"""
Get the action space of the environment
:return: the action space
"""
return self._action_space
@action_space.setter
def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
"""
Set the action space of the environment
:return: None
"""
self._action_space = val
@property
def state_space(self) -> Union[List[StateSpace], StateSpace]:
"""
Get the state space of the environment
:return: the state space
"""
return self._state_space
@state_space.setter
def state_space(self, val: Union[List[StateSpace], StateSpace]):
"""
Set the state space of the environment
:return: None
"""
self._state_space = val
@property
def goal_space(self) -> Union[List[ObservationSpace], ObservationSpace]:
"""
Get the goal space of the environment
:return: the goal space
"""
return self._goal_space
@goal_space.setter
def goal_space(self, val: Union[List[ObservationSpace], ObservationSpace]):
"""
Set the goal space of the environment
:return: None
"""
self._goal_space = val
def get_action_from_user(self) -> ActionType:
"""
Get an action from the user keyboard
:return: action index
"""
if self.wait_for_explicit_human_action:
while len(self.renderer.pressed_keys) == 0:
self.renderer.get_events()
if self.key_to_action == {}:
# the keys are the numbers on the keyboard corresponding to the action index
if len(self.renderer.pressed_keys) > 0:
action_idx = self.renderer.pressed_keys[0] - ord("1")
if 0 <= action_idx < self.action_space.shape[0]:
return action_idx
else:
# the keys are mapped through the environment to more intuitive keyboard keys
# key = tuple(self.renderer.pressed_keys)
# for key in self.renderer.pressed_keys:
for env_keys in self.key_to_action.keys():
if set(env_keys) == set(self.renderer.pressed_keys):
return self.action_space.actions[self.key_to_action[env_keys]]
# return the default action 0 so that the environment will continue running
return self.action_space.default_action
@property
def last_env_response(self) -> Union[List[EnvResponse], EnvResponse]:
"""
Get the last environment response
:return: a dictionary that contains the state, reward, etc.
"""
return squeeze_list(self._last_env_response)
@last_env_response.setter
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
"""
Set the last environment response
:param val: the last environment response
"""
self._last_env_response = force_list(val)
def step(self, action: ActionType) -> EnvResponse:
"""
Make a single step in the environment using the given action
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
:return: the environment response as returned in get_last_env_response
"""
# allow passing None actions to repeat the previously done action
if action is None:
action = self.last_action
action = self.action_space.clip_action_to_space(action)
if self.action_space and not self.action_space.val_matches_space_definition(action):
raise ValueError("The given action does not match the action space definition. "
"Action = {}, action space definition = {}".format(action, self.action_space))
# store the last agent action done
self.last_action = action
if self.visualization_parameters.add_rendered_image_to_env_response:
current_rendered_image = self.get_rendered_image()
self.current_episode_steps_counter += 1
if self.phase != RunPhase.UNDEFINED:
self.total_steps_counter += 1
# act
self._take_action(action)
# observe
self._update_state()
if self.is_rendered:
self.render()
self.total_reward_in_current_episode += self.reward
if self.visualization_parameters.add_rendered_image_to_env_response:
self.info['image'] = current_rendered_image
self.last_env_response = \
EnvResponse(
reward=self.reward,
next_state=self.state,
goal=self.goal,
game_over=self.done,
info=self.info
)
# store observations for video / gif dumping
if self.should_dump_video_of_the_current_episode(episode_terminated=False) and \
(self.visualization_parameters.dump_mp4 or self.visualization_parameters.dump_gifs):
self.last_episode_images.append(self.get_rendered_image())
return self.last_env_response
def render(self) -> None:
"""
Call the environment function for rendering to the screen
"""
if self.native_rendering:
self._render()
else:
self.renderer.render_image(self.get_rendered_image())
def reset_internal_state(self, force_environment_reset=False) -> EnvResponse:
"""
Reset the environment and all the variable of the wrapper
:param force_environment_reset: forces environment reset even when the game did not end
:return: A dictionary containing the observation, reward, done flag, action and measurements
"""
self.dump_video_of_last_episode_if_needed()
self._restart_environment_episode(force_environment_reset)
self.last_episode_time = time.time()
if self.current_episode_steps_counter > 0 and self.phase != RunPhase.UNDEFINED:
self.episode_idx += 1
self.done = False
self.total_reward_in_current_episode = self.reward = 0.0
self.last_action = 0
self.current_episode_steps_counter = 0
self.last_episode_images = []
self._update_state()
# render before the preprocessing of the observation, so that the image will be in its original quality
if self.is_rendered:
self.render()
self.last_env_response = \
EnvResponse(
reward=self.reward,
next_state=self.state,
goal=self.goal,
game_over=self.done,
info=self.info
)
return self.last_env_response
def get_random_action(self) -> ActionType:
"""
Returns an action picked uniformly from the available actions
:return: a numpy array with a random action
"""
return self.action_space.sample()
def get_available_keys(self) -> List[Tuple[str, ActionType]]:
"""
Return a list of tuples mapping between action names and the keyboard key that triggers them
:return: a list of tuples mapping between action names and the keyboard key that triggers them
"""
available_keys = []
if self.key_to_action != {}:
for key, idx in sorted(self.key_to_action.items(), key=operator.itemgetter(1)):
if key != ():
key_names = [self.renderer.get_key_names([k])[0] for k in key]
available_keys.append((self.action_space.descriptions[idx], ' + '.join(key_names)))
elif type(self.action_space) == DiscreteActionSpace:
for action in range(self.action_space.shape):
available_keys.append(("Action {}".format(action + 1), action + 1))
return available_keys
def get_goal(self) -> GoalType:
"""
Get the current goal that the agents needs to achieve in the environment
:return: The goal
"""
return self.goal
def set_goal(self, goal: GoalType) -> None:
"""
Set the current goal that the agent needs to achieve in the environment
:param goal: the goal that needs to be achieved
:return: None
"""
self.goal = goal
def should_dump_video_of_the_current_episode(self, episode_terminated=False):
if self.visualization_parameters.video_dump_methods:
for video_dump_method in force_list(self.visualization_parameters.video_dump_methods):
if not video_dump_method.should_dump(episode_terminated, **self.__dict__):
return False
return True
return False
def dump_video_of_last_episode_if_needed(self):
if self.visualization_parameters.video_dump_methods and self.last_episode_images != []:
if self.should_dump_video_of_the_current_episode(episode_terminated=True):
self.dump_video_of_last_episode()
def dump_video_of_last_episode(self):
frame_skipping = max(1, int(5 / self.frame_skip))
file_name = 'episode-{}_score-{}'.format(self.episode_idx, self.total_reward_in_current_episode)
fps = 10
if self.visualization_parameters.dump_gifs:
logger.create_gif(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
if self.visualization_parameters.dump_mp4:
logger.create_mp4(self.last_episode_images[::frame_skipping], name=file_name, fps=fps)
def log_to_screen(self):
# log to screen
log = OrderedDict()
log["Episode"] = self.episode_idx
log["Total reward"] = np.round(self.total_reward_in_current_episode, 2)
log["Steps"] = self.total_steps_counter
screen.log_dict(log, prefix=self.phase.value)
# The following functions define the interaction with the environment.
# Any new environment that inherits the Environment class should use these signatures.
# Some of these functions are optional - please read their description for more details.
def _take_action(self, action_idx: ActionType) -> None:
"""
An environment dependent function that sends an action to the simulator.
:param action_idx: the action to perform on the environment
:return: None
"""
raise NotImplementedError("")
def _update_state(self) -> None:
"""
Updates the state from the environment.
Should update self.observation, self.reward, self.done, self.measurements and self.info
:return: None
"""
raise NotImplementedError("")
def _restart_environment_episode(self, force_environment_reset=False) -> None:
"""
Restarts the simulator episode
:param force_environment_reset: Force the environment to reset even if the episode is not done yet.
:return: None
"""
raise NotImplementedError("")
def _render(self) -> None:
"""
Renders the environment using the native simulator renderer
:return: None
"""
pass
def get_rendered_image(self) -> np.ndarray:
"""
Return a numpy array containing the image that will be rendered to the screen.
This can be different from the observation. For example, mujoco's observation is a measurements vector.
:return: numpy array containing the image that will be rendered to the screen
"""
return np.transpose(self.state['observation'], [1, 2, 0])
"""
Video Dumping Methods
"""
class VideoDumpMethod(object):
"""
Method used to decide when to dump videos
"""
def should_dump(self, episode_terminated=False, **kwargs):
raise NotImplementedError("")
class AlwaysDumpMethod(VideoDumpMethod):
"""
Dump video for every episode
"""
def __init__(self):
super().__init__()
def should_dump(self, episode_terminated=False, **kwargs):
return True
class MaxDumpMethod(VideoDumpMethod):
"""
Dump video every time a new max total reward has been achieved
"""
def __init__(self):
super().__init__()
self.max_reward_achieved = -np.inf
def should_dump(self, episode_terminated=False, **kwargs):
# if the episode has not finished yet we want to be prepared for dumping a video
if not episode_terminated:
return True
if kwargs['total_reward_in_current_episode'] > self.max_reward_achieved:
self.max_reward_achieved = kwargs['total_reward_in_current_episode']
return True
else:
return False
class EveryNEpisodesDumpMethod(object):
"""
Dump videos once in every N episodes
"""
def __init__(self, num_episodes_between_dumps: int):
super().__init__()
self.num_episodes_between_dumps = num_episodes_between_dumps
self.last_dumped_episode = 0
if num_episodes_between_dumps < 1:
raise ValueError("the number of episodes between dumps should be a positive number")
def should_dump(self, episode_terminated=False, **kwargs):
if kwargs['episode_idx'] >= self.last_dumped_episode + self.num_episodes_between_dumps - 1:
self.last_dumped_episode = kwargs['episode_idx']
return True
else:
return False
class SelectedPhaseOnlyDumpMethod(object):
"""
Dump videos when the phase of the environment matches a predefined phase
"""
def __init__(self, run_phases: Union[RunPhase, List[RunPhase]]):
self.run_phases = force_list(run_phases)
def should_dump(self, episode_terminated=False, **kwargs):
if kwargs['_phase'] in self.run_phases:
return True
else:
return False

View File

@@ -0,0 +1,149 @@
########################################################################################################################
####### Currently we are ignoring more complex cases including EnvironmentGroups - DO NOT USE THIS FILE ****************
########################################################################################################################
# #
# # Copyright (c) 2017 Intel Corporation
# #
# # Licensed under the Apache License, Version 2.0 (the "License");
# # you may not use this file except in compliance with the License.
# # You may obtain a copy of the License at
# #
# # http://www.apache.org/licenses/LICENSE-2.0
# #
# # Unless required by applicable law or agreed to in writing, software
# # distributed under the License is distributed on an "AS IS" BASIS,
# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# # See the License for the specific language governing permissions and
# # limitations under the License.
# #
#
# from typing import Union, List, Dict
# import numpy as np
# from environments import create_environment
# from environments.environment import Environment
# from environments.environment_interface import EnvironmentInterface, ActionType, ActionSpace
# from core_types import GoalType, Transition
#
#
# class EnvironmentGroup(EnvironmentInterface):
# """
# An EnvironmentGroup is a group of different environments.
# In the simple case, it will contain a single environment. But it can also contain multiple environments,
# where the agent can then act on them as a batch, such that the prediction of the action is more efficient.
# """
# def __init__(self, environments_parameters: List[Environment]):
# self.environments_parameters = environments_parameters
# self.environments = []
# self.action_space = []
# self.outgoing_control = []
# self._last_env_response = []
#
# @property
# def action_space(self) -> Union[List[ActionSpace], ActionSpace]:
# """
# Get the action space of the environment
# :return: the action space
# """
# return self.action_space
#
# @action_space.setter
# def action_space(self, val: Union[List[ActionSpace], ActionSpace]):
# """
# Set the action space of the environment
# :return: None
# """
# self.action_space = val
#
# @property
# def phase(self) -> RunPhase:
# """
# Get the phase of the environments group
# :return: the current phase
# """
# return self.phase
#
# @phase.setter
# def phase(self, val: RunPhase):
# """
# Change the phase of each one of the environments in the group
# :param val: the new phase
# :return: None
# """
# self.phase = val
# call_method_for_all(self.environments, 'phase', val)
#
# def _create_environments(self):
# """
# Create the environments using the given parameters and update the environments list
# :return: None
# """
# for environment_parameters in self.environments_parameters:
# environment = create_environment(environment_parameters)
# self.action_space = self.action_space.append(environment.action_space)
# self.environments.append(environment)
#
# @property
# def last_env_response(self) -> Union[List[Transition], Transition]:
# """
# Get the last environment response
# :return: a dictionary that contains the state, reward, etc.
# """
# return squeeze_list(self._last_env_response)
#
# @last_env_response.setter
# def last_env_response(self, val: Union[List[Transition], Transition]):
# """
# Set the last environment response
# :param val: the last environment response
# """
# self._last_env_response = force_list(val)
#
# def step(self, actions: Union[List[ActionType], ActionType]) -> List[Transition]:
# """
# Act in all the environments in the group.
# :param actions: can be either a single action if there is a single environment in the group, or a list of
# actions in case there are multiple environments in the group. Each action can be an action index
# or a numpy array representing a continuous action for example.
# :return: The responses from all the environments in the group
# """
#
# actions = force_list(actions)
# if len(actions) != len(self.environments):
# raise ValueError("The number of actions does not match the number of environments in the group")
#
# result = []
# for environment, action in zip(self.environments, actions):
# result.append(environment.step(action))
#
# self.last_env_response = result
#
# return result
#
# def reset(self, force_environment_reset: bool=False) -> List[Transition]:
# """
# Reset all the environments in the group
# :param force_environment_reset: force the reset of each one of the environments
# :return: a list of the environments responses
# """
# return call_method_for_all(self.environments, 'reset', force_environment_reset)
#
# def get_random_action(self) -> List[ActionType]:
# """
# Get a list of random action that can be applied on the environments in the group
# :return: a list of random actions
# """
# return call_method_for_all(self.environments, 'get_random_action')
#
# def set_goal(self, goal: GoalType) -> None:
# """
# Set the goal of each one of the environments in the group to be the given goal
# :param goal: a goal vector
# :return: None
# """
# # TODO: maybe enable setting multiple goals?
# call_method_for_all(self.environments, 'set_goal', goal)

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union, Dict
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import ActionType, EnvResponse, RunPhase
class EnvironmentInterface(object):
def __init__(self):
self._phase = RunPhase.UNDEFINED
@property
def phase(self) -> RunPhase:
"""
Get the phase of the environment
:return: the current phase
"""
return self._phase
@phase.setter
def phase(self, val: RunPhase):
"""
Change the phase of the environment
:param val: the new phase
:return: None
"""
self._phase = val
@property
def action_space(self) -> Union[Dict[str, ActionSpace], ActionSpace]:
"""
Get the action space of the environment (or of each of the agents wrapped in this environment,
i.e. in the LevelManager case)
:return: the action space
"""
raise NotImplementedError("")
def get_random_action(self) -> ActionType:
"""
Get a random action from the environment action space
:return: An action that follows the definition of the action space.
"""
raise NotImplementedError("")
def step(self, action: ActionType) -> Union[None, EnvResponse]:
"""
Make a single step in the environment using the given action
:param action: an action to use for stepping the environment. Should follow the definition of the action space.
:return: the environment response as returned in get_last_env_response or None for LevelManager
"""
raise NotImplementedError("")
def reset_internal_state(self, force_environment_reset: bool=False) -> Union[None, EnvResponse]:
"""
Reset the environment episode
:param force_environment_reset: in some cases, resetting the environment can be suppressed by the environment
itself. This flag allows forcing the reset.
:return: the environment response as returned in get_last_env_response or None for LevelManager
"""
raise NotImplementedError("")

View File

@@ -0,0 +1,454 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import gym
import numpy as np
import scipy.ndimage
from rl_coach.utils import lower_under_to_upper, short_dynamic_import
try:
import roboschool
from OpenGL import GL
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("RoboSchool")
try:
from rl_coach.gym_extensions.continuous import mujoco
except:
from rl_coach.logger import failed_imports
failed_imports.append("GymExtensions")
try:
import pybullet_envs
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("PyBullet")
from typing import Dict, Any, Union
from rl_coach.core_types import RunPhase
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace, ImageObservationSpace, VectorObservationSpace, \
StateSpace, RewardSpace
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
from rl_coach.filters.reward.reward_clipping_filter import RewardClippingFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.filters.filter import InputFilter
import random
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.logger import screen
# Parameters
class GymEnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.random_initialization_steps = 0
self.max_over_num_frames = 1
self.additional_simulator_parameters = None
@property
def path(self):
return 'rl_coach.environments.gym_environment:GymEnvironment'
"""
Roboschool Environment Components
"""
RoboSchoolInputFilters = NoInputFilter()
RoboSchoolOutputFilters = NoOutputFilter()
class Roboschool(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = RoboSchoolInputFilters
self.default_output_filter = RoboSchoolOutputFilters
gym_roboschool_envs = ['inverted_pendulum', 'inverted_pendulum_swingup', 'inverted_double_pendulum', 'reacher',
'hopper', 'walker2d', 'half_cheetah', 'ant', 'humanoid', 'humanoid_flagrun',
'humanoid_flagrun_harder', 'pong']
roboschool_v0 = {e: "{}".format(lower_under_to_upper(e) + '-v0') for e in gym_roboschool_envs}
"""
Mujoco Environment Components
"""
MujocoInputFilter = NoInputFilter()
MujocoOutputFilter = NoOutputFilter()
class Mujoco(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = MujocoInputFilter
self.default_output_filter = MujocoOutputFilter
gym_mujoco_envs = ['inverted_pendulum', 'inverted_double_pendulum', 'reacher', 'hopper', 'walker2d', 'half_cheetah',
'ant', 'swimmer', 'humanoid', 'humanoid_standup', 'pusher', 'thrower', 'striker']
mujoco_v2 = {e: "{}".format(lower_under_to_upper(e) + '-v2') for e in gym_mujoco_envs}
mujoco_v2['walker2d'] = 'Walker2d-v2'
gym_fetch_envs = ['reach', 'slide', 'push', 'pick_and_place']
fetch_v1 = {e: "{}".format('Fetch' + lower_under_to_upper(e) + '-v1') for e in gym_fetch_envs}
"""
Bullet Environment Components
"""
BulletInputFilter = NoInputFilter()
BulletOutputFilter = NoOutputFilter()
class Bullet(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 1
self.default_input_filter = BulletInputFilter
self.default_output_filter = BulletOutputFilter
"""
Atari Environment Components
"""
AtariInputFilter = InputFilter(is_a_reference_filter=True)
AtariInputFilter.add_reward_filter('clipping', RewardClippingFilter(-1.0, 1.0))
AtariInputFilter.add_observation_filter('observation', 'rescaling',
ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
high=255)))
AtariInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
AtariInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
AtariInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(4))
AtariOutputFilter = NoOutputFilter()
class Atari(GymEnvironmentParameters):
def __init__(self):
super().__init__()
self.frame_skip = 4
self.max_over_num_frames = 2
self.random_initialization_steps = 30
self.default_input_filter = AtariInputFilter
self.default_output_filter = AtariOutputFilter
gym_atari_envs = ['air_raid', 'alien', 'amidar', 'assault', 'asterix', 'asteroids', 'atlantis',
'bank_heist', 'battle_zone', 'beam_rider', 'berzerk', 'bowling', 'boxing', 'breakout', 'carnival',
'centipede', 'chopper_command', 'crazy_climber', 'demon_attack', 'double_dunk',
'elevator_action', 'enduro', 'fishing_derby', 'freeway', 'frostbite', 'gopher', 'gravitar',
'hero', 'ice_hockey', 'jamesbond', 'journey_escape', 'kangaroo', 'krull', 'kung_fu_master',
'montezuma_revenge', 'ms_pacman', 'name_this_game', 'phoenix', 'pitfall', 'pong', 'pooyan',
'private_eye', 'qbert', 'riverraid', 'road_runner', 'robotank', 'seaquest', 'skiing',
'solaris', 'space_invaders', 'star_gunner', 'tennis', 'time_pilot', 'tutankham', 'up_n_down',
'venture', 'video_pinball', 'wizard_of_wor', 'yars_revenge', 'zaxxon']
atari_deterministic_v4 = {e: lower_under_to_upper(e) + 'Deterministic-v4' for e in gym_atari_envs}
atari_no_frameskip_v4 = {e: lower_under_to_upper(e) + 'NoFrameskip-v4' for e in gym_atari_envs}
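# Resulting mappings (sketch, assuming lower_under_to_upper('space_invaders') returns 'SpaceInvaders'):
#   atari_deterministic_v4['breakout']  ->  'BreakoutDeterministic-v4'
#   atari_no_frameskip_v4['pong']       ->  'PongNoFrameskip-v4'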
class MaxOverFramesAndFrameskipEnvWrapper(gym.Wrapper):
def __init__(self, env, frameskip=4, max_over_num_frames=2):
super().__init__(env)
self.max_over_num_frames = max_over_num_frames
self.observations_stack = []
self.frameskip = frameskip
self.first_frame_to_max_over = self.frameskip - self.max_over_num_frames
def reset(self):
return self.env.reset()
def step(self, action):
total_reward = 0.0
done = None
info = None
self.observations_stack = []
for i in range(self.frameskip):
observation, reward, done, info = self.env.step(action)
if i >= self.first_frame_to_max_over:
self.observations_stack.append(observation)
total_reward += reward
if done:
# deal with last state in episode
if not self.observations_stack:
self.observations_stack.append(observation)
break
max_over_frames_observation = np.max(self.observations_stack, axis=0)
return max_over_frames_observation, total_reward, done, info
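# Usage sketch for the wrapper above (assumes a NoFrameskip Atari env, so that frame skipping
# and the pixel-wise max over the last frames are handled here rather than inside ALE):
#   raw_env = gym.make('BreakoutNoFrameskip-v4')
#   env = MaxOverFramesAndFrameskipEnvWrapper(raw_env, frameskip=4, max_over_num_frames=2)
#   observation = env.reset()
#   observation, reward, done, info = env.step(env.action_space.sample())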
# Environment
class GymEnvironment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
additional_simulator_parameters: Dict[str, Any] = None, seed: Union[None, int]=None,
human_control: bool=False, custom_reward_threshold: Union[int, float]=None,
random_initialization_steps: int=1, max_over_num_frames: int=1, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
visualization_parameters)
self.random_initialization_steps = random_initialization_steps
self.max_over_num_frames = max_over_num_frames
self.additional_simulator_parameters = additional_simulator_parameters
# hide warnings
gym.logger.set_level(40)
"""
Load and initialize the environment.
Environment ids can be defined in 3 ways:
1. Native gym environments, e.g. BreakoutDeterministic-v0
2. Custom gym environments that are written and installed as python packages.
   These environments should contain a python module with a class that inherits from gym.Env,
   implements the relevant functions (_reset, _step, _render) and defines the observation and
   action spaces.
   For example: my_environment_package:MyEnvironmentClass will run the environment defined in
   the MyEnvironmentClass class
3. Custom gym environments written as an independent module which is not installed.
   These environments should contain a python module with a class that inherits from gym.Env,
   implements the relevant functions (_reset, _step, _render) and defines the observation and
   action spaces.
   For example: path_to_my_environment.sub_directory.my_module:MyEnvironmentClass will run the
   environment defined in the MyEnvironmentClass class, which is located in the module at the
   relative path path_to_my_environment.sub_directory.my_module
"""
if ':' in self.env_id:
# custom environments
if '/' in self.env_id or '.' in self.env_id:
# environment in an absolute path module written as a unix path, or in a relative path module
# written as a python import path
env_class = short_dynamic_import(self.env_id)
else:
# environment in a python package
env_class = gym.envs.registration.load(self.env_id)
# instantiate the environment
if self.additional_simulator_parameters:
self.env = env_class(**self.additional_simulator_parameters)
else:
self.env = env_class()
else:
self.env = gym.make(self.env_id)
# for classic control we want to use the native renderer because otherwise we will get 2 renderer windows
environment_to_always_use_with_native_rendering = ['classic_control', 'mujoco', 'robotics']
self.native_rendering = self.native_rendering or \
any([env in str(self.env.unwrapped.__class__)
for env in environment_to_always_use_with_native_rendering])
if self.native_rendering:
if hasattr(self, 'renderer'):
self.renderer.close()
# seed
if self.seed is not None:
self.env.seed(self.seed)
np.random.seed(self.seed)
random.seed(self.seed)
# frame skip and max between consecutive frames
self.is_robotics_env = 'robotics' in str(self.env.unwrapped.__class__)
self.is_mujoco_env = 'mujoco' in str(self.env.unwrapped.__class__)
self.is_atari_env = 'Atari' in str(self.env.unwrapped.__class__)
self.timelimit_env_wrapper = self.env
if self.is_atari_env:
self.env.unwrapped.frameskip = 1 # this accesses the atari env that is wrapped with a timelimit wrapper env
if self.env_id == "SpaceInvadersDeterministic-v4" and self.frame_skip == 4:
screen.warning("Warning: The frame-skip for Space Invaders was automatically updated from 4 to 3. "
"This is following the DQN paper where it was noticed that a frame-skip of 3 makes the "
"laser rays disappear. To force frame-skip of 4, please use SpaceInvadersNoFrameskip-v4.")
self.frame_skip = 3
self.env = MaxOverFramesAndFrameskipEnvWrapper(self.env,
frameskip=self.frame_skip,
max_over_num_frames=self.max_over_num_frames)
else:
self.env.unwrapped.frameskip = self.frame_skip
self.state_space = StateSpace({})
# observations
if not isinstance(self.env.observation_space, gym.spaces.Dict):
state_space = {'observation': self.env.observation_space}
else:
state_space = self.env.observation_space.spaces
for observation_space_name, observation_space in state_space.items():
if len(observation_space.shape) == 3 and observation_space.shape[-1] == 3:
# we assume the observation is an RGB image with pixel values in the range 0-255
self.state_space[observation_space_name] = ImageObservationSpace(
shape=np.array(observation_space.shape),
high=255,
channels_axis=-1
)
else:
self.state_space[observation_space_name] = VectorObservationSpace(
shape=observation_space.shape[0],
low=observation_space.low,
high=observation_space.high
)
if 'desired_goal' in state_space.keys():
self.goal_space = self.state_space['desired_goal']
# actions
if type(self.env.action_space) == gym.spaces.box.Box:
self.action_space = BoxActionSpace(
shape=self.env.action_space.shape,
low=self.env.action_space.low,
high=self.env.action_space.high
)
elif type(self.env.action_space) == gym.spaces.discrete.Discrete:
actions_description = []
if hasattr(self.env.unwrapped, 'get_action_meanings'):
actions_description = self.env.unwrapped.get_action_meanings()
self.action_space = DiscreteActionSpace(
num_actions=self.env.action_space.n,
descriptions=actions_description
)
if self.human_control:
# TODO: add this to the action space
# map keyboard keys to actions
self.key_to_action = {}
if hasattr(self.env.unwrapped, 'get_keys_to_action'):
self.key_to_action = self.env.unwrapped.get_keys_to_action()
# initialize the state by getting a new state from the environment
self.reset_internal_state(True)
# render
if self.is_rendered:
image = self.get_rendered_image()
scale = 1
if self.human_control:
scale = 2
if not self.native_rendering:
self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)
# measurements
if self.env.spec is not None:
self.timestep_limit = self.env.spec.timestep_limit
else:
self.timestep_limit = None
# the info is only updated after the first step
self.state = self.step(self.action_space.default_action).next_state
self.state_space['measurements'] = VectorObservationSpace(shape=len(self.info.keys()))
if self.env.spec and custom_reward_threshold is None:
self.reward_success_threshold = self.env.spec.reward_threshold
self.reward_space = RewardSpace(1, reward_success_threshold=self.reward_success_threshold)
def _wrap_state(self, state):
if not isinstance(self.env.observation_space, gym.spaces.Dict):
return {'observation': state}
return state
def _update_state(self):
if self.is_atari_env and hasattr(self, 'current_ale_lives') \
and self.current_ale_lives != self.env.unwrapped.ale.lives():
if self.phase == RunPhase.TRAIN or self.phase == RunPhase.HEATUP:
# signal termination for life loss
self.done = True
elif self.phase == RunPhase.TEST and not self.done:
# the episode is not terminated in evaluation, but we need to press fire again
self._press_fire()
self._update_ale_lives()
# TODO: update the measurements
if self.state and "desired_goal" in self.state.keys():
self.goal = self.state['desired_goal']
def _take_action(self, action):
if type(self.action_space) == BoxActionSpace:
action = self.action_space.clip_action_to_space(action)
self.state, self.reward, self.done, self.info = self.env.step(action)
self.state = self._wrap_state(self.state)
def _random_noop(self):
# simulate a random initial environment state by stepping for a random number of steps between 0 and random_initialization_steps
step_count = 0
random_initialization_steps = random.randint(0, self.random_initialization_steps)
while self.action_space is not None and (self.state is None or step_count < random_initialization_steps):
step_count += 1
self.step(self.action_space.default_action)
def _press_fire(self):
fire_action = 1
if self.is_atari_env and self.env.unwrapped.get_action_meanings()[fire_action] == 'FIRE':
self.current_ale_lives = self.env.unwrapped.ale.lives()
self.step(fire_action)
if self.done:
self.reset_internal_state()
def _update_ale_lives(self):
if self.is_atari_env:
self.current_ale_lives = self.env.unwrapped.ale.lives()
def _restart_environment_episode(self, force_environment_reset=False):
# prevent reset of environment if there are ale lives left
if (self.is_atari_env and self.env.unwrapped.ale.lives() > 0) \
and not force_environment_reset and not self.timelimit_env_wrapper._past_limit():
self.step(self.action_space.default_action)
else:
self.state = self.env.reset()
self.state = self._wrap_state(self.state)
self._update_ale_lives()
if self.is_atari_env:
self._random_noop()
self._press_fire()
# initialize the number of lives
self._update_ale_lives()
def _set_mujoco_camera(self, camera_idx: int):
"""
This function can be used to set the camera for rendering the mujoco simulator
:param camera_idx: The index of the camera to use. Should be defined in the model
:return: None
"""
if self.env.unwrapped.viewer.cam.fixedcamid != camera_idx and self.env.unwrapped.viewer._ncam > camera_idx:
from mujoco_py.generated import const
self.env.unwrapped.viewer.cam.type = const.CAMERA_FIXED
self.env.unwrapped.viewer.cam.fixedcamid = camera_idx
def _get_robotics_image(self):
self.env.render()
image = self.env.unwrapped._get_viewer().read_pixels(1600, 900, depth=False)[::-1, :, :]
image = scipy.misc.imresize(image, (270, 480, 3))
return image
def _render(self):
self.env.render(mode='human')
# required for setting up a fixed camera for mujoco
if self.is_mujoco_env:
self._set_mujoco_camera(0)
def get_rendered_image(self):
if self.is_robotics_env:
# necessary for fetch since the rendered image is cropped to an irrelevant part of the simulator
image = self._get_robotics_image()
else:
image = self.env.render(mode='rgb_array')
# required for setting up a fixed camera for mujoco
if self.is_mujoco_env:
self._set_mujoco_camera(0)
return image
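# Configuration sketch (assumption: as in Coach presets, the gym id is set through the
# environment parameters' `level` attribute and the framework instantiates GymEnvironment
# from the `path` property defined above):
#   env_params = Atari()
#   env_params.level = atari_deterministic_v4['breakout']  # 'BreakoutDeterministic-v4'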

View File

View File

@@ -0,0 +1,38 @@
# Copyright 2017 The dm_control Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Functions to manage the common assets for domains."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from dm_control.utils import resources
_SUITE_DIR = os.path.dirname(os.path.dirname(__file__))
_FILENAMES = [
"common/materials.xml",
"common/skybox.xml",
"common/visual.xml",
]
ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename))
for filename in _FILENAMES}
def read_model(model_filename):
"""Reads a model XML file and returns its contents as a string."""
return resources.GetResource(os.path.join(_SUITE_DIR, model_filename))
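# Usage sketch (hypothetical model file name; the returned XML string is typically consumed
# together with ASSETS, e.g. by dm_control's mujoco.Physics.from_xml_string):
#   model_xml = read_model('pendulum.xml')
#   physics = mujoco.Physics.from_xml_string(model_xml, assets=ASSETS)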

View File

@@ -0,0 +1,22 @@
<!--
Common textures, colors and materials to be used throughout this suite. Some
materials such as xxx_highlight are activated on occurrence of certain events,
for example receiving a positive reward.
-->
<mujoco>
<asset>
<texture name="grid" type="2d" builtin="checker" rgb1=".1 .2 .3" rgb2=".2 .3 .4" width="300" height="300" mark="edge" markrgb=".2 .3 .4"/>
<material name="grid" texture="grid" texrepeat="1 1" texuniform="true" reflectance=".2"/>
<material name="self" rgba=".7 .5 .3 1"/>
<material name="self_default" rgba=".7 .5 .3 1"/>
<material name="self_highlight" rgba="0 .5 .3 1"/>
<material name="effector" rgba=".7 .4 .2 1"/>
<material name="effector_default" rgba=".7 .4 .2 1"/>
<material name="effector_highlight" rgba="0 .5 .3 1"/>
<material name="decoration" rgba=".3 .5 .7 1"/>
<material name="eye" rgba="0 .2 1 1"/>
<material name="target" rgba=".6 .3 .3 1"/>
<material name="target_default" rgba=".6 .3 .3 1"/>
<material name="target_highlight" rgba=".6 .3 .3 .4"/>
</asset>
</mujoco>

View File

@@ -0,0 +1,6 @@
<mujoco>
<asset>
<texture name="skybox" type="skybox" builtin="gradient" rgb1=".4 .6 .8" rgb2="0 0 0"
width="800" height="800" mark="random" markrgb="1 1 1"/>
</asset>
</mujoco>

View File

@@ -0,0 +1,7 @@
<mujoco>
<visual>
<headlight ambient=".4 .4 .4" diffuse=".8 .8 .8" specular="0.1 0.1 0.1"/>
<map znear=".01"/>
<quality shadowsize="2048"/>
</visual>
</mujoco>

View File

@@ -0,0 +1,185 @@
import numpy as np
import gym
import os
from gym import spaces
from gym.envs.registration import EnvSpec
from mujoco_py import load_model_from_path, MjSim, MjViewer, MjRenderContextOffscreen
class PendulumWithGoals(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
}
def __init__(self, goal_reaching_thresholds=np.array([0.075, 0.075, 0.75]),
goal_not_reached_penalty=-1, goal_reached_reward=0, terminate_on_goal_reaching=True,
time_limit=1000, frameskip=1, random_goals_instead_of_standing_goal=False,
polar_coordinates: bool=False):
super().__init__()
dir = os.path.dirname(__file__)
model = load_model_from_path(dir + "/pendulum_with_goals.xml")
self.sim = MjSim(model)
self.viewer = None
self.rgb_viewer = None
self.frameskip = frameskip
self.goal = None
self.goal_reaching_thresholds = goal_reaching_thresholds
self.goal_not_reached_penalty = goal_not_reached_penalty
self.goal_reached_reward = goal_reached_reward
self.terminate_on_goal_reaching = terminate_on_goal_reaching
self.time_limit = time_limit
self.current_episode_steps_counter = 0
self.random_goals_instead_of_standing_goal = random_goals_instead_of_standing_goal
self.polar_coordinates = polar_coordinates
# spaces definition
self.action_space = spaces.Box(low=-self.sim.model.actuator_ctrlrange[:, 1],
high=self.sim.model.actuator_ctrlrange[:, 1],
dtype=np.float32)
if self.polar_coordinates:
self.observation_space = spaces.Dict({
"observation": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32),
"desired_goal": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32),
"achieved_goal": spaces.Box(low=np.array([-np.pi, -15]),
high=np.array([np.pi, 15]),
dtype=np.float32)
})
else:
self.observation_space = spaces.Dict({
"observation": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32),
"desired_goal": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32),
"achieved_goal": spaces.Box(low=np.array([-1, -1, -15]),
high=np.array([1, 1, 15]),
dtype=np.float32)
})
self.spec = EnvSpec('PendulumWithGoals-v0')
self.spec.reward_threshold = self.goal_not_reached_penalty * self.time_limit
self.reset()
def _goal_reached(self):
observation = self._get_obs()
if np.any(np.abs(observation['achieved_goal'] - observation['desired_goal']) > self.goal_reaching_thresholds):
return False
else:
return True
def _terminate(self):
if (self._goal_reached() and self.terminate_on_goal_reaching) or \
self.current_episode_steps_counter >= self.time_limit:
return True
else:
return False
def _reward(self):
if self._goal_reached():
return self.goal_reached_reward
else:
return self.goal_not_reached_penalty
def step(self, action):
self.sim.data.ctrl[:] = action
for _ in range(self.frameskip):
self.sim.step()
self.current_episode_steps_counter += 1
state = self._get_obs()
# visualize the angular velocities
state_velocity = np.copy(state['observation'][-1] / 20)
goal_velocity = self.goal[-1] / 20
self.sim.model.site_size[2] = np.array([0.01, 0.01, state_velocity])
self.sim.data.mocap_pos[2] = np.array([0.85, 0, 0.75 + state_velocity])
self.sim.model.site_size[3] = np.array([0.01, 0.01, goal_velocity])
self.sim.data.mocap_pos[3] = np.array([1.15, 0, 0.75 + goal_velocity])
return state, self._reward(), self._terminate(), {}
def _get_obs(self):
"""
y
^
|____
| /
| /
|~/
|/
--------> x
"""
# observation
angle = self.sim.data.qpos
angular_velocity = self.sim.data.qvel
if self.polar_coordinates:
observation = np.concatenate([angle - np.pi, angular_velocity])
else:
x = np.sin(angle)
y = np.cos(angle) # qpos is the angle relative to a standing pole
observation = np.concatenate([x, y, angular_velocity])
return {
"observation": observation,
"desired_goal": self.goal,
"achieved_goal": observation
}
def reset(self):
self.current_episode_steps_counter = 0
# set initial state
angle = np.random.uniform(np.pi / 4, 7 * np.pi / 4)
angular_velocity = np.random.uniform(-0.05, 0.05)
self.sim.data.qpos[0] = angle
self.sim.data.qvel[0] = angular_velocity
self.sim.step()
# goal
if self.random_goals_instead_of_standing_goal:
angle_target = np.random.uniform(-np.pi / 8, np.pi / 8)
angular_velocity_target = np.random.uniform(-0.2, 0.2)
else:
angle_target = 0
angular_velocity_target = 0
# convert target values to goal
x_target = np.sin(angle_target)
y_target = np.cos(angle_target)
if self.polar_coordinates:
self.goal = np.array([angle_target - np.pi, angular_velocity_target])
else:
self.goal = np.array([x_target, y_target, angular_velocity_target])
# visualize the goal
self.sim.data.mocap_pos[0] = [x_target, 0, y_target]
return self._get_obs()
def render(self, mode='human', close=False):
if mode == 'human':
if self.viewer is None:
self.viewer = MjViewer(self.sim)
self.viewer.render()
elif mode == 'rgb_array':
if self.rgb_viewer is None:
self.rgb_viewer = MjRenderContextOffscreen(self.sim, 0)
self.rgb_viewer.render(500, 500)
# window size used for old mujoco-py:
data = self.rgb_viewer.read_pixels(500, 500, depth=False)
# original image is upside-down, so flip it
return data[::-1, :, :]
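# Minimal demo sketch (requires mujoco_py and the bundled pendulum_with_goals.xml):
if __name__ == '__main__':
    demo_env = PendulumWithGoals(frameskip=5, random_goals_instead_of_standing_goal=True)
    demo_obs = demo_env.reset()
    for _ in range(20):
        # sample a random torque and step; the observation dict carries the desired goal as well
        demo_obs, demo_reward, demo_done, _ = demo_env.step(demo_env.action_space.sample())
        if demo_done:
            demo_obs = demo_env.reset()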

View File

@@ -0,0 +1,42 @@
<mujoco model="pendulum_with_goals">
<include file="./common/visual.xml"/>
<include file="./common/skybox.xml"/>
<include file="./common/materials.xml"/>
<option timestep="0.002">
<flag contact="disable" energy="enable"/>
</option>
<worldbody>
<light name="light" pos="0 0 2"/>
<geom name="floor" size="2 2 .2" type="plane" material="grid"/>
<camera name="fixed" pos="0 -1.5 2" xyaxes='1 0 0 0 1 1'/>
<camera name="lookat" mode="targetbodycom" target="pole" pos="0 -2 1"/>
<body name="pole" pos="0 0 .6">
<joint name="hinge" type="hinge" axis="0 1 0" damping="0.1"/>
<geom name="base" material="decoration" type="cylinder" fromto="0 -.03 0 0 .03 0" size="0.021" mass="0"/>
<geom name="pole" material="self" type="capsule" fromto="0 0 0 0 0 0.5" size="0.02" mass="0"/>
<geom name="mass" material="effector" type="sphere" pos="0 0 0.5" size="0.05" mass="1"/>
</body>
<body name="end_goal" pos="0 0 0" mocap="true">
<site type="sphere" size="0.05" rgba="1 1 0 1" />
</body>
<!--<body name="sub_goal" pos="0 0 0" mocap="true">-->
<!--<site type="sphere" size="0.05" rgba="1 0 1 1" />-->
<!--</body>-->
<body name="current_velo" pos="0.0 0 0.0" mocap="true">
<site type="box" size="0.01 0.01 0.1" rgba="1 1 1 1" />
</body>
<body name="subgoal_velo" pos="0.0 0 0.0" mocap="true">
<site type="box" size="0.01 0.01 0.1" rgba="1 0 1 1" />
</body>
<body name="zero_velo" pos="1.0 0 0.75" mocap="true">
<site type="box" size="0.3 0.01 0.01" rgba="1 0 0 1" />
</body>
</worldbody>
<actuator>
<motor name="torque" joint="hinge" gear="1" ctrlrange="-2 2" ctrllimited="true"/>
</actuator>
</mujoco>

View File

@@ -0,0 +1,245 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from enum import Enum
from typing import Union, List
import numpy as np
from rl_coach.filters.observation.observation_move_axis_filter import ObservationMoveAxisFilter
try:
from pysc2 import maps
from pysc2.env import sc2_env
from pysc2.env import available_actions_printer
from pysc2.lib import actions
from pysc2.lib import features
from pysc2.env import environment
from absl import app
from absl import flags
except ImportError:
from rl_coach.logger import failed_imports
failed_imports.append("PySc2")
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.spaces import BoxActionSpace, VectorObservationSpace, PlanarMapsObservationSpace, StateSpace, CompoundActionSpace, \
DiscreteActionSpace
from rl_coach.filters.filter import InputFilter, OutputFilter
from rl_coach.filters.observation.observation_rescale_to_size_filter import ObservationRescaleToSizeFilter
from rl_coach.filters.action.linear_box_to_box_map import LinearBoxToBoxMap
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
FLAGS = flags.FLAGS
FLAGS(['coach.py'])
SCREEN_SIZE = 84 # will also impact the action space size
# Starcraft Constants
_NOOP = actions.FUNCTIONS.no_op.id
_MOVE_SCREEN = actions.FUNCTIONS.Move_screen.id
_SELECT_ARMY = actions.FUNCTIONS.select_army.id
_PLAYER_RELATIVE = features.SCREEN_FEATURES.player_relative.index
_NOT_QUEUED = [0]
_SELECT_ALL = [0]
class StarcraftObservationType(Enum):
Features = 0
RGB = 1
StarcraftInputFilter = InputFilter(is_a_reference_filter=True)
StarcraftInputFilter.add_observation_filter('screen', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('screen', 'rescaling',
ObservationRescaleToSizeFilter(
PlanarMapsObservationSpace(np.array([84, 84, 1]),
low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('screen', 'to_uint8', ObservationToUInt8Filter(0, 255))
StarcraftInputFilter.add_observation_filter('minimap', 'move_axis', ObservationMoveAxisFilter(0, -1))
StarcraftInputFilter.add_observation_filter('minimap', 'rescaling',
ObservationRescaleToSizeFilter(
PlanarMapsObservationSpace(np.array([64, 64, 1]),
low=0, high=255, channels_axis=-1)))
StarcraftInputFilter.add_observation_filter('minimap', 'to_uint8', ObservationToUInt8Filter(0, 255))
StarcraftNormalizingOutputFilter = OutputFilter(is_a_reference_filter=True)
StarcraftNormalizingOutputFilter.add_action_filter(
'normalization', LinearBoxToBoxMap(input_space_low=-SCREEN_SIZE / 2, input_space_high=SCREEN_SIZE / 2 - 1))
class StarCraft2EnvironmentParameters(EnvironmentParameters):
def __init__(self):
super().__init__()
self.screen_size = 84
self.minimap_size = 64
self.feature_minimap_maps_to_use = range(7)
self.feature_screen_maps_to_use = range(17)
self.observation_type = StarcraftObservationType.Features
self.disable_fog = False
self.auto_select_all_army = True
self.default_input_filter = StarcraftInputFilter
self.default_output_filter = StarcraftNormalizingOutputFilter
self.use_full_action_space = False
@property
def path(self):
return 'rl_coach.environments.starcraft2_environment:StarCraft2Environment'
# Environment
class StarCraft2Environment(Environment):
def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
seed: Union[None, int]=None, human_control: bool=False,
custom_reward_threshold: Union[int, float]=None,
screen_size: int=84, minimap_size: int=64,
feature_minimap_maps_to_use: List=range(7), feature_screen_maps_to_use: List=range(17),
observation_type: StarcraftObservationType=StarcraftObservationType.Features,
disable_fog: bool=False, auto_select_all_army: bool=True,
use_full_action_space: bool=False, **kwargs):
super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)
self.screen_size = screen_size
self.minimap_size = minimap_size
self.feature_minimap_maps_to_use = feature_minimap_maps_to_use
self.feature_screen_maps_to_use = feature_screen_maps_to_use
self.observation_type = observation_type
self.features_screen_size = None
self.feature_minimap_size = None
self.rgb_screen_size = None
self.rgb_minimap_size = None
if self.observation_type == StarcraftObservationType.Features:
self.features_screen_size = screen_size
self.feature_minimap_size = minimap_size
elif self.observation_type == StarcraftObservationType.RGB:
self.rgb_screen_size = screen_size
self.rgb_minimap_size = minimap_size
self.disable_fog = disable_fog
self.auto_select_all_army = auto_select_all_army
self.use_full_action_space = use_full_action_space
# step_mul is the equivalent of frame skipping. It is unclear whether the action is repeated for the skipped frames.
self.env = sc2_env.SC2Env(map_name=self.env_id, step_mul=frame_skip,
visualize=self.is_rendered,
agent_interface_format=sc2_env.AgentInterfaceFormat(
feature_dimensions=sc2_env.Dimensions(
screen=self.features_screen_size,
minimap=self.feature_minimap_size
)
# rgb_dimensions=sc2_env.Dimensions(
# screen=self.rgb_screen_size,
# minimap=self.rgb_screen_size
# )
),
# feature_screen_size=self.features_screen_size,
# feature_minimap_size=self.feature_minimap_size,
# rgb_screen_size=self.rgb_screen_size,
# rgb_minimap_size=self.rgb_screen_size,
disable_fog=disable_fog,
random_seed=self.seed
)
# print all the available actions
# self.env = available_actions_printer.AvailableActionsPrinter(self.env)
self.reset_internal_state(True)
"""
feature_screen: [height_map, visibility_map, creep, power, player_id, player_relative, unit_type, selected,
unit_hit_points, unit_hit_points_ratio, unit_energy, unit_energy_ratio, unit_shields,
unit_shields_ratio, unit_density, unit_density_aa, effects]
feature_minimap: [height_map, visibility_map, creep, camera, player_id, player_relative, selected]
player: [player_id, minerals, vespene, food_cap, food_army, food_workers, idle_worker_count,
army_count, warp_gate_count, larva_count]
"""
self.screen_shape = np.array(self.env.observation_spec()[0]['feature_screen'])
self.screen_shape[0] = len(self.feature_screen_maps_to_use)
self.minimap_shape = np.array(self.env.observation_spec()[0]['feature_minimap'])
self.minimap_shape[0] = len(self.feature_minimap_maps_to_use)
self.state_space = StateSpace({
"screen": PlanarMapsObservationSpace(shape=self.screen_shape, low=0, high=255, channels_axis=0),
"minimap": PlanarMapsObservationSpace(shape=self.minimap_shape, low=0, high=255, channels_axis=0),
"measurements": VectorObservationSpace(self.env.observation_spec()[0]["player"][0])
})
if self.use_full_action_space:
action_identifiers = list(self.env.action_spec()[0].functions)
num_action_identifiers = len(action_identifiers)
action_arguments = [(arg.name, arg.sizes) for arg in self.env.action_spec()[0].types]
sub_action_spaces = [DiscreteActionSpace(num_action_identifiers)]
for argument in action_arguments:
for dimension in argument[1]:
sub_action_spaces.append(DiscreteActionSpace(dimension))
self.action_space = CompoundActionSpace(sub_action_spaces)
else:
self.action_space = BoxActionSpace(2, 0, self.screen_size - 1, ["X-Axis", "Y-Axis"],
default_action=np.array([self.screen_size/2, self.screen_size/2]))
def _update_state(self):
timestep = 0
self.screen = self.last_result[timestep].observation.feature_screen
# extract only the requested segmentation maps from the observation
self.screen = np.take(self.screen, self.feature_screen_maps_to_use, axis=0)
self.minimap = self.last_result[timestep].observation.feature_minimap
self.measurements = self.last_result[timestep].observation.player
self.reward = self.last_result[timestep].reward
self.done = self.last_result[timestep].step_type == environment.StepType.LAST
self.state = {
'screen': self.screen,
'minimap': self.minimap,
'measurements': self.measurements
}
def _take_action(self, action):
if self.use_full_action_space:
action_identifier = action[0]
action_arguments = action[1:]
action = actions.FunctionCall(action_identifier, action_arguments)
else:
coord = np.array(action[0:2])
noop = False
coord = coord.round()
coord = np.clip(coord, 0, SCREEN_SIZE - 1)
self.last_action_idx = coord
if noop:
action = actions.FunctionCall(_NOOP, [])
else:
action = actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, coord])
self.last_result = self.env.step(actions=[action])
def _restart_environment_episode(self, force_environment_reset=False):
# reset the environment
self.last_result = self.env.reset()
# select all the units on the screen
if self.auto_select_all_army:
self.env.step(actions=[actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])])
def get_rendered_image(self):
screen = np.squeeze(np.tile(np.expand_dims(self.screen, -1), (1, 1, 3)))
screen = screen / np.max(screen) * 255
return screen.astype('uint8')
def dump_video_of_last_episode(self):
from rl_coach.logger import experiment_path
self.env._run_config.replay_dir = experiment_path
self.env.save_replay('replays')
super().dump_video_of_last_episode()
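# Configuration sketch (assumption: a pysc2 mini-game name is selected through the parameters'
# `level` attribute, mirroring the other Coach environment parameters):
#   env_params = StarCraft2EnvironmentParameters()
#   env_params.level = 'CollectMineralShards'
#   env_params.use_full_action_space = False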

View File

@@ -0,0 +1,82 @@
import numpy as np
import gym
from gym import spaces
import random
class BitFlip(gym.Env):
metadata = {
'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
}
def __init__(self, bit_length=16, max_steps=None, mean_zero=False):
super(BitFlip, self).__init__()
if bit_length < 1:
raise ValueError('bit_length must be >= 1, found {}'.format(bit_length))
self.bit_length = bit_length
self.mean_zero = mean_zero
if max_steps is None:
# default to bit_length
self.max_steps = bit_length
elif max_steps == 0:
self.max_steps = None
else:
self.max_steps = max_steps
# spaces documentation: https://gym.openai.com/docs/
self.action_space = spaces.Discrete(bit_length)
self.observation_space = spaces.Dict({
'state': spaces.Box(low=0, high=1, shape=(bit_length, )),
'desired_goal': spaces.Box(low=0, high=1, shape=(bit_length, )),
'achieved_goal': spaces.Box(low=0, high=1, shape=(bit_length, ))
})
self.reset()
def _terminate(self):
return (self.state == self.goal).all() or (self.max_steps is not None and self.steps >= self.max_steps)
def _reward(self):
return -1 if (self.state != self.goal).any() else 0
def step(self, action):
# action is an int in the range [0, self.bit_length)
self.state[action] = int(not self.state[action])
self.steps += 1
return (self._get_obs(), self._reward(), self._terminate(), {})
def reset(self):
self.steps = 0
self.state = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])
# make sure goal is not the initial state
self.goal = self.state
while (self.goal == self.state).all():
self.goal = np.array([random.choice([1, 0]) for _ in range(self.bit_length)])
return self._get_obs()
def _mean_zero(self, x):
if self.mean_zero:
return (x - 0.5) / 0.5
else:
return x
def _get_obs(self):
return {
'state': self._mean_zero(self.state),
'desired_goal': self._mean_zero(self.goal),
'achieved_goal': self._mean_zero(self.state)
}
def render(self, mode='human', close=False):
observation = np.zeros((20, 20 * self.bit_length, 3))
for bit_idx, (state_bit, goal_bit) in enumerate(zip(self.state, self.goal)):
# green if the bit matches
observation[:, bit_idx * 20:(bit_idx + 1) * 20, 1] = (state_bit == goal_bit) * 255
# red if the bit doesn't match
observation[:, bit_idx * 20:(bit_idx + 1) * 20, 0] = (state_bit != goal_bit) * 255
return observation
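# Minimal demo sketch (only needs gym and numpy, which are imported above):
if __name__ == '__main__':
    demo_env = BitFlip(bit_length=8)
    demo_obs = demo_env.reset()
    # flip bit 0 and inspect the goal-conditioned observation dict
    demo_obs, demo_reward, demo_done, _ = demo_env.step(0)
    print(demo_obs['desired_goal'], demo_obs['achieved_goal'], demo_reward, demo_done)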

Some files were not shown because too many files have changed in this diff.