From 125c7ee38d413e4709271292c473f5930196b137 Mon Sep 17 00:00:00 2001
From: Itai Caspi <30383381+itaicaspi-intel@users.noreply.github.com>
Date: Tue, 19 Dec 2017 19:27:16 +0200
Subject: [PATCH] Release 0.9
Main changes are detailed below:
New features -
* CARLA 0.7 simulator integration
* Human control of the game play
* Recording of human game play and storing / loading the replay buffer
* Behavioral cloning agent and presets
* Golden tests for several presets
* Selecting between deep / shallow image embedders
* Rendering through pygame (with some boost in performance)
API changes -
* Improved environment wrapper API
* Added an evaluate flag to allow convenient evaluation of existing checkpoints
* Improved frameskip definition in Gym
Bug fixes -
* Fixed loading of checkpoints for agents with more than one network
* Fixed the N-Step Q learning agent Python 3 compatibility
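
A minimal sketch of the new command line flags (preset names are placeholders; human play can also be requested without a preset by specifying an environment type and level):

```
# play as a human and record the game play into a replay buffer
python3 coach.py -p <preset_name> --play

# run in evaluation-only mode, e.g. against a restored checkpoint
python3 coach.py -p <preset_name> --evaluate
```

The recorded game play is stored as replay_buffer.p under the experiment directory and can be fed to the behavioral cloning agent through the new load_memory_from_file_path agent parameter.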
---
README.md | 68 +++---
agents/__init__.py | 3 +
agents/agent.py | 104 ++++----
agents/bc_agent.py | 40 +++
agents/distributional_dqn_agent.py | 60 +++++
agents/human_agent.py | 67 +++++
agents/imitation_agent.py | 70 ++++++
agents/n_step_q_agent.py | 4 +-
agents/policy_optimization_agent.py | 2 +-
architectures/network_wrapper.py | 5 +-
.../tensorflow_components/embedders.py | 81 ++++--
coach.py | 86 +++++--
configurations.py | 52 +++-
docs/docs/algorithms/imitation/bc.md | 25 ++
.../value_optimization/distributional_dqn.md | 33 +++
docs/docs/contributing/add_env.md | 42 +++-
docs/docs/img/algorithms.png | Bin 31579 -> 35829 bytes
docs/docs/usage.md | 133 ++++++++++
docs/mkdocs.yml | 2 +
environments/CarlaSettings.ini | 62 +++++
environments/__init__.py | 5 +-
environments/carla_environment_wrapper.py | 230 ++++++++++++++++++
environments/doom_environment_wrapper.py | 90 +++++--
environments/environment_wrapper.py | 153 ++++++++++--
environments/gym_environment_wrapper.py | 94 +++----
img/algorithms.png | Bin 31579 -> 35829 bytes
img/ant.gif | Bin 7617954 -> 0 bytes
img/carla.gif | Bin 0 -> 3755784 bytes
img/doom.gif | Bin 4932942 -> 0 bytes
img/doom_deathmatch.gif | Bin 0 -> 3117056 bytes
img/minitaur.gif | Bin 3129894 -> 0 bytes
img/montezuma.gif | Bin 0 -> 285044 bytes
install.sh | 14 +-
logger.py | 7 +-
memories/episodic_experience_replay.py | 4 +-
memories/memory.py | 21 +-
presets.py | 103 +++++++-
renderer.py | 85 +++++++
requirements_coach.txt | 1 +
run_test.py | 164 +++++++++++++
utils.py | 63 ++++-
41 files changed, 1713 insertions(+), 260 deletions(-)
create mode 100644 agents/bc_agent.py
create mode 100644 agents/distributional_dqn_agent.py
create mode 100644 agents/human_agent.py
create mode 100644 agents/imitation_agent.py
create mode 100644 docs/docs/algorithms/imitation/bc.md
create mode 100644 docs/docs/algorithms/value_optimization/distributional_dqn.md
create mode 100644 docs/docs/usage.md
create mode 100644 environments/CarlaSettings.ini
create mode 100644 environments/carla_environment_wrapper.py
delete mode 100644 img/ant.gif
create mode 100644 img/carla.gif
delete mode 100644 img/doom.gif
create mode 100644 img/doom_deathmatch.gif
delete mode 100644 img/minitaur.gif
create mode 100644 img/montezuma.gif
create mode 100644 renderer.py
create mode 100644 run_test.py
diff --git a/README.md b/README.md
index e503556..16562e4 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,16 @@ Training an agent to solve an environment is as easy as running:
python3 coach.py -p CartPole_DQN -r
```
-
+
Blog post from the Intel® Nervana™ website can be found [here](https://www.intelnervana.com/reinforcement-learning-coach-intel).
+
+## Documentation
+
+Framework documentation, algorithm description and instructions on how to contribute a new agent/environment can be found [here](http://coach.nervanasys.com).
+
+
## Installation
Note: Coach has only been tested on Ubuntu 16.04 LTS, and with Python 3.5.
@@ -103,6 +109,8 @@ For example:
It is easy to create new presets for different levels or environments by following the same pattern as in presets.py
+More usage examples can be found [here](http://coach.nervanasys.com/usage/index.html).
+
## Running Coach Dashboard (Visualization)
Training an agent to solve an environment can be tricky, at times.
@@ -121,11 +129,6 @@ python3 dashboard.py
-## Documentation
-
-Framework documentation, algoritmic description and instructions on how to contribute a new agent/environment can be found [here](http://coach.nervanasys.com).
-
-
## Parallelizing an Algorithm
Since the introduction of [A3C](https://arxiv.org/abs/1602.01783) in 2016, many algorithms were shown to benefit from running multiple instances in parallel, on many CPU cores. So far, these algorithms include [A3C](https://arxiv.org/abs/1602.01783), [DDPG](https://arxiv.org/pdf/1704.03073.pdf), [PPO](https://arxiv.org/pdf/1707.06347.pdf), and [NAF](https://arxiv.org/pdf/1610.00633.pdf), and this is most probably only the beginning.
@@ -150,11 +153,11 @@ python3 coach.py -p Hopper_A3C -n 16
## Supported Environments
-* OpenAI Gym
+* *OpenAI Gym:*
Installed by default by Coach's installer.
-* ViZDoom:
+* *ViZDoom:*
Follow the instructions described in the ViZDoom repository -
@@ -162,13 +165,13 @@ python3 coach.py -p Hopper_A3C -n 16
Additionally, Coach assumes that the environment variable VIZDOOM_ROOT points to the ViZDoom installation directory.
-* Roboschool:
+* *Roboschool:*
Follow the instructions described in the roboschool repository -
https://github.com/openai/roboschool
-* GymExtensions:
+* *GymExtensions:*
Follow the instructions described in the GymExtensions repository -
@@ -176,10 +179,19 @@ python3 coach.py -p Hopper_A3C -n 16
Additionally, add the installation directory to the PYTHONPATH environment variable.
-* PyBullet
+* *PyBullet:*
Follow the instructions described in the [Quick Start Guide](https://docs.google.com/document/d/10sXEhzFRSnvFcl3XxNGhnD4N2SedqwdAvK3dsihxVUA) (basically just - 'pip install pybullet')
+* *CARLA:*
+
+ Download release 0.7 from the CARLA repository -
+
+ https://github.com/carla-simulator/carla/releases
+
+ Create a new CARLA_ROOT environment variable pointing to CARLA's installation directory.
+
+ A simple CARLA settings file (```CarlaSettings.ini```) is supplied with Coach, and is located in the ```environments``` directory.
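+
+ For example, assuming CARLA was extracted to an illustrative path:
+
+ ```
+ export CARLA_ROOT=~/CARLA_0.7.0
+ ```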
## Supported Algorithms
@@ -190,24 +202,24 @@ python3 coach.py -p Hopper_A3C -n 16
-* [Deep Q Network (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
-* [Double Deep Q Network (DDQN)](https://arxiv.org/pdf/1509.06461.pdf)
+* [Deep Q Network (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) ([code](agents/dqn_agent.py))
+* [Double Deep Q Network (DDQN)](https://arxiv.org/pdf/1509.06461.pdf) ([code](agents/ddqn_agent.py))
* [Dueling Q Network](https://arxiv.org/abs/1511.06581)
-* [Mixed Monte Carlo (MMC)](https://arxiv.org/abs/1703.01310)
-* [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860)
-* [Categorical Deep Q Network (C51)](https://arxiv.org/abs/1707.06887)
-* [Quantile Regression Deep Q Network (QR-DQN)](https://arxiv.org/pdf/1710.10044v1.pdf)
-* [Bootstrapped Deep Q Network](https://arxiv.org/abs/1602.04621)
-* [N-Step Q Learning](https://arxiv.org/abs/1602.01783) | **Distributed**
-* [Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988)
-* [Normalized Advantage Functions (NAF)](https://arxiv.org/abs/1603.00748.pdf) | **Distributed**
-* [Policy Gradients (PG)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | **Distributed**
-* [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/abs/1602.01783) | **Distributed**
-* [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | **Distributed**
-* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf)
-* [Clipped Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) | **Distributed**
-* [Direct Future Prediction (DFP)](https://arxiv.org/abs/1611.01779) | **Distributed**
-
+* [Mixed Monte Carlo (MMC)](https://arxiv.org/abs/1703.01310) ([code](agents/mmc_agent.py))
+* [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860) ([code](agents/pal_agent.py))
+* [Categorical Deep Q Network (C51)](https://arxiv.org/abs/1707.06887) ([code](agents/categorical_dqn_agent.py))
+* [Quantile Regression Deep Q Network (QR-DQN)](https://arxiv.org/pdf/1710.10044v1.pdf) ([code](agents/qr_dqn_agent.py))
+* [Bootstrapped Deep Q Network](https://arxiv.org/abs/1602.04621) ([code](agents/bootstrapped_dqn_agent.py))
+* [N-Step Q Learning](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](agents/n_step_q_agent.py))
+* [Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988) ([code](agents/nec_agent.py))
+* [Normalized Advantage Functions (NAF)](https://arxiv.org/abs/1603.00748.pdf) | **Distributed** ([code](agents/naf_agent.py))
+* [Policy Gradients (PG)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | **Distributed** ([code](agents/policy_gradients_agent.py))
+* [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](agents/actor_critic_agent.py))
+* [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | **Distributed** ([code](agents/ddpg_agent.py))
+* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf) ([code](agents/ppo_agent.py))
+* [Clipped Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) | **Distributed** ([code](agents/clipped_ppo_agent.py))
+* [Direct Future Prediction (DFP)](https://arxiv.org/abs/1611.01779) | **Distributed** ([code](agents/dfp_agent.py))
+* Behavioral Cloning (BC) ([code](agents/bc_agent.py))
diff --git a/agents/__init__.py b/agents/__init__.py
index b1ae8d3..fdbd13e 100644
--- a/agents/__init__.py
+++ b/agents/__init__.py
@@ -16,6 +16,7 @@
from agents.actor_critic_agent import *
from agents.agent import *
+from agents.bc_agent import *
from agents.bootstrapped_dqn_agent import *
from agents.clipped_ppo_agent import *
from agents.ddpg_agent import *
@@ -23,6 +24,8 @@ from agents.ddqn_agent import *
from agents.dfp_agent import *
from agents.dqn_agent import *
from agents.categorical_dqn_agent import *
+from agents.human_agent import *
+from agents.imitation_agent import *
from agents.mmc_agent import *
from agents.n_step_q_agent import *
from agents.naf_agent import *
diff --git a/agents/agent.py b/agents/agent.py
index ed9eabc..a541fa5 100644
--- a/agents/agent.py
+++ b/agents/agent.py
@@ -50,6 +50,7 @@ class Agent(object):
self.task_id = task_id
self.sess = tuning_parameters.sess
self.env = tuning_parameters.env_instance = env
+ self.imitation = False
# i/o dimensions
if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
@@ -61,7 +62,12 @@ class Agent(object):
self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,)
# modules
- self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
+ if tuning_parameters.agent.load_memory_from_file_path:
+ screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
+ .format(tuning_parameters.agent.load_memory_from_file_path))
+ self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
+ else:
+ self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
# self.architecture = eval(tuning_parameters.architecture)
self.has_global = replicated_device is not None
@@ -121,11 +127,12 @@ class Agent(object):
def log_to_screen(self, phase):
# log to screen
- if self.current_episode > 0:
- if phase == RunPhase.TEST:
- exploration = self.evaluation_exploration_policy.get_control_param()
- else:
+ if self.current_episode >= 0:
+ if phase == RunPhase.TRAIN:
exploration = self.exploration_policy.get_control_param()
+ else:
+ exploration = self.evaluation_exploration_policy.get_control_param()
+
screen.log_dict(
OrderedDict([
("Worker", self.task_id),
@@ -135,7 +142,7 @@ class Agent(object):
("steps", self.total_steps_counter),
("training iteration", self.training_iteration)
]),
- prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
+ prefix=phase
)
def update_log(self, phase=RunPhase.TRAIN):
@@ -146,7 +153,7 @@ class Agent(object):
# log all the signals to file
logger.set_current_time(self.current_episode)
logger.create_signal_value('Training Iter', self.training_iteration)
- logger.create_signal_value('In Heatup', int(self.in_heatup))
+ logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
logger.create_signal_value('ER #Episodes', self.memory.length())
logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
@@ -197,24 +204,6 @@ class Agent(object):
network.curr_rnn_c_in = network.middleware_embedder.c_init
network.curr_rnn_h_in = network.middleware_embedder.h_init
- def stack_observation(self, curr_stack, observation):
- """
- Adds a new observation to an existing stack of observations from previous time-steps.
- :param curr_stack: The current observations stack.
- :param observation: The new observation
- :return: The updated observation stack
- """
-
- if curr_stack == []:
- # starting an episode
- curr_stack = np.vstack(np.expand_dims([observation] * self.tp.env.observation_stack_size, 0))
- curr_stack = self.switch_axes_order(curr_stack, from_type='channels_first', to_type='channels_last')
- else:
- curr_stack = np.append(curr_stack, np.expand_dims(np.squeeze(observation), axis=-1), axis=-1)
- curr_stack = np.delete(curr_stack, 0, -1)
-
- return curr_stack
-
def preprocess_observation(self, observation):
"""
Preprocesses the given observation.
@@ -335,26 +324,6 @@ class Agent(object):
reward = max(reward, self.tp.env.reward_clipping_min)
return reward
- def switch_axes_order(self, observation, from_type='channels_first', to_type='channels_last'):
- """
- transpose an observation axes from channels_first to channels_last or vice versa
- :param observation: a numpy array
- :param from_type: can be 'channels_first' or 'channels_last'
- :param to_type: can be 'channels_first' or 'channels_last'
- :return: a new observation with the requested axes order
- """
- if from_type == to_type or len(observation.shape) == 1:
- return observation
- assert 2 <= len(observation.shape) <= 3, 'num axes of an observation must be 2 for a vector or 3 for an image'
- assert type(observation) == np.ndarray, 'observation must be a numpy array'
- if len(observation.shape) == 3:
- if from_type == 'channels_first' and to_type == 'channels_last':
- return np.transpose(observation, (1, 2, 0))
- elif from_type == 'channels_last' and to_type == 'channels_first':
- return np.transpose(observation, (2, 0, 1))
- else:
- return np.transpose(observation, (1, 0))
-
def act(self, phase=RunPhase.TRAIN):
"""
Take one step in the environment according to the network prediction and store the transition in memory
@@ -370,7 +339,7 @@ class Agent(object):
is_first_transition_in_episode = (self.curr_state == [])
if is_first_transition_in_episode:
observation = self.preprocess_observation(self.env.observation)
- observation = self.stack_observation([], observation)
+ observation = stack_observation([], observation, self.tp.env.observation_stack_size)
self.curr_state = {'observation': observation}
if self.tp.agent.use_measurements:
@@ -378,7 +347,7 @@ class Agent(object):
if self.tp.agent.use_accumulated_reward_as_measurement:
self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
- if self.in_heatup: # we do not have a stacked curr_state yet
+ if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
action = self.env.get_random_action()
else:
action, action_info = self.choose_action(self.curr_state, phase=phase)
@@ -394,11 +363,11 @@ class Agent(object):
observation = self.preprocess_observation(result['observation'])
# plot action values online
- if self.tp.visualization.plot_action_values_online and not self.in_heatup:
+ if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
self.plot_action_values_online()
# initialize the next state
- observation = self.stack_observation(self.curr_state['observation'], observation)
+ observation = stack_observation(self.curr_state['observation'], observation, self.tp.env.observation_stack_size)
next_state = {'observation': observation}
if self.tp.agent.use_measurements and 'measurements' in result.keys():
@@ -407,7 +376,7 @@ class Agent(object):
next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
# store the transition only if we are training
- if phase == RunPhase.TRAIN:
+ if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
for key in action_info.keys():
transition.info[key] = action_info[key]
@@ -427,7 +396,7 @@ class Agent(object):
self.update_log(phase=phase)
self.log_to_screen(phase=phase)
- if phase == RunPhase.TRAIN:
+ if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
self.reset_game()
self.current_episode += 1
@@ -462,11 +431,12 @@ class Agent(object):
for network in self.networks:
network.sync()
- if self.tp.visualization.dump_gifs and self.total_reward_in_current_episode > max_reward_achieved:
+ if self.total_reward_in_current_episode > max_reward_achieved:
max_reward_achieved = self.total_reward_in_current_episode
frame_skipping = int(5/self.tp.env.frame_skip)
- logger.create_gif(self.last_episode_images[::frame_skipping],
- name='score-{}'.format(max_reward_achieved), fps=10)
+ if self.tp.visualization.dump_gifs:
+ logger.create_gif(self.last_episode_images[::frame_skipping],
+ name='score-{}'.format(max_reward_achieved), fps=10)
average_evaluation_reward += self.total_reward_in_current_episode
self.reset_game()
@@ -496,7 +466,7 @@ class Agent(object):
screen.log_title("Starting heatup {}".format(self.task_id))
num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
- self.act()
+ self.act(phase=RunPhase.HEATUP)
# training phase
self.in_heatup = False
@@ -509,7 +479,12 @@ class Agent(object):
# evaluate
evaluate_agent = (self.last_episode_evaluation_ran is not self.current_episode) and \
(self.current_episode % self.tp.evaluate_every_x_episodes == 0)
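+ # imitation agents learn from a fixed dataset rather than from played episodes,
+ # so for them evaluation is also triggered every evaluate_every_x_training_iterations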
+ evaluate_agent = evaluate_agent or \
+ (self.imitation and self.training_iteration > 0 and
+ self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0)
+
if evaluate_agent:
+ self.env.reset()
self.last_episode_evaluation_ran = self.current_episode
self.evaluate(self.tp.evaluation_episodes)
@@ -522,14 +497,15 @@ class Agent(object):
self.save_model(model_snapshots_periods_passed)
# play and record in replay buffer
- if self.tp.agent.step_until_collecting_full_episodes:
- step = 0
- while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0:
- self.act()
- step += 1
- else:
- for step in range(self.tp.agent.num_consecutive_playing_steps):
- self.act()
+ if self.tp.agent.collect_new_data:
+ if self.tp.agent.step_until_collecting_full_episodes:
+ step = 0
+ while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0:
+ self.act()
+ step += 1
+ else:
+ for step in range(self.tp.agent.num_consecutive_playing_steps):
+ self.act()
# train
if self.tp.train:
@@ -537,6 +513,8 @@ class Agent(object):
loss = self.train()
self.loss.add_sample(loss)
self.training_iteration += 1
+ if self.imitation:
+ self.log_to_screen(RunPhase.TRAIN)
self.post_training_commands()
def save_model(self, model_id):
diff --git a/agents/bc_agent.py b/agents/bc_agent.py
new file mode 100644
index 0000000..e065731
--- /dev/null
+++ b/agents/bc_agent.py
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from agents.imitation_agent import *
+
+
+# Behavioral Cloning Agent
+class BCAgent(ImitationAgent):
+ def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
+ ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+
+ def learn_from_batch(self, batch):
+ current_states, _, actions, _, _, _ = self.extract_batch(batch)
+
+ # create the inputs for the network
+ input = current_states
+
+ # the targets for the network are the actions since this is supervised learning
+ if self.env.discrete_controls:
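+ # one-hot encode the demonstrated actions so they can be used as classification targets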
+ targets = np.eye(self.env.action_space_size)[[actions]]
+ else:
+ targets = actions
+
+ result = self.main_network.train_and_sync_networks(input, targets)
+ total_loss = result[0]
+
+ return total_loss
diff --git a/agents/distributional_dqn_agent.py b/agents/distributional_dqn_agent.py
new file mode 100644
index 0000000..d7c0088
--- /dev/null
+++ b/agents/distributional_dqn_agent.py
@@ -0,0 +1,60 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from agents.value_optimization_agent import *
+
+
+# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
+class DistributionalDQNAgent(ValueOptimizationAgent):
+ def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
+ ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+ self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
+
+ # prediction's format is (batch,actions,atoms)
+ def get_q_values(self, prediction):
+ return np.dot(prediction, self.z_values)
+
+ def learn_from_batch(self, batch):
+ current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
+
+ # for the action we actually took, the error is calculated by the atoms distribution
+ # for all other actions, the error is 0
+ distributed_q_st_plus_1 = self.main_network.target_network.predict(next_states)
+ # initialize the targets with the current prediction so that the loss is zero for actions that were not taken
+ TD_targets = self.main_network.online_network.predict(current_states)
+
+ # only update the action that we have actually done in this transition
+ target_actions = np.argmax(self.get_q_values(distributed_q_st_plus_1), axis=1)
+ m = np.zeros((self.tp.batch_size, self.z_values.size))
+
+ batches = np.arange(self.tp.batch_size)
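+ # project the shifted support r + (1 - done) * discount * z_j back onto the fixed atoms:
+ # each target atom is clipped to [v_min, v_max], mapped to a fractional index bj, and its
+ # probability mass is split between the neighbouring atoms l = floor(bj) and u = ceil(bj)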
+ for j in range(self.z_values.size):
+ tzj = np.fmax(np.fmin(rewards + (1.0 - game_overs) * self.tp.agent.discount * self.z_values[j],
+ self.z_values[self.z_values.size - 1]),
+ self.z_values[0])
+ bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
+ u = (np.ceil(bj)).astype(int)
+ l = (np.floor(bj)).astype(int)
+ m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
+ m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
+ # total_loss = cross entropy between actual result above and predicted result for the given action
+ TD_targets[batches, actions] = m
+
+ result = self.main_network.train_and_sync_networks(current_states, TD_targets)
+ total_loss = result[0]
+
+ return total_loss
+
diff --git a/agents/human_agent.py b/agents/human_agent.py
new file mode 100644
index 0000000..c75c2a2
--- /dev/null
+++ b/agents/human_agent.py
@@ -0,0 +1,67 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from agents.agent import *
+import pygame
+
+
+class HumanAgent(Agent):
+ def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
+ Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+
+ self.clock = pygame.time.Clock()
+ self.max_fps = int(self.tp.visualization.max_fps_for_human_control)
+
+ screen.log_title("Human Control Mode")
+ available_keys = self.env.get_available_keys()
+ if available_keys:
+ screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
+ screen.log("")
+ for action, key in self.env.get_available_keys():
+ screen.log("\t- {}: {}".format(action, key))
+ screen.separator()
+
+ def train(self):
+ return 0
+
+ def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+ action = self.env.get_action_from_user()
+
+ # keep constant fps
+ self.clock.tick(self.max_fps)
+
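+ # if the user closed the rendering window, store the recorded replay buffer and exit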
+ if not self.env.renderer.is_open:
+ self.save_replay_buffer_and_exit()
+
+ return action, {"action_value": 0}
+
+ def save_replay_buffer_and_exit(self):
+ replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p')
+ self.memory.tp = None
+ to_pickle(self.memory, replay_buffer_path)
+ screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
+ exit()
+
+ def log_to_screen(self, phase):
+ # log to screen
+ screen.log_dict(
+ OrderedDict([
+ ("Episode", self.current_episode),
+ ("total reward", self.total_reward_in_current_episode),
+ ("steps", self.total_steps_counter)
+ ]),
+ prefix="Recording"
+ )
diff --git a/agents/imitation_agent.py b/agents/imitation_agent.py
new file mode 100644
index 0000000..f7f5e06
--- /dev/null
+++ b/agents/imitation_agent.py
@@ -0,0 +1,70 @@
+#
+# Copyright (c) 2017 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from agents.agent import *
+
+
+# Imitation Agent
+class ImitationAgent(Agent):
+ def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
+ Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+ self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
+ self.replicated_device, self.worker_device)
+ self.networks.append(self.main_network)
+ self.imitation = True
+
+ def extract_action_values(self, prediction):
+ return prediction.squeeze()
+
+ def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+ # convert to batch so we can run it through the network
+ observation = np.expand_dims(np.array(curr_state['observation']), 0)
+ if self.tp.agent.use_measurements:
+ measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
+ prediction = self.main_network.online_network.predict([observation, measurements])
+ else:
+ prediction = self.main_network.online_network.predict(observation)
+
+ # get action values and extract the best action from it
+ action_values = self.extract_action_values(prediction)
+ if self.env.discrete_controls:
+ # DISCRETE
+ # action = np.argmax(action_values)
+ action = self.evaluation_exploration_policy.get_action(action_values)
+ action_value = {"action_probability": action_values[action]}
+ else:
+ # CONTINUOUS
+ action = action_values
+ action_value = {}
+
+ return action, action_value
+
+ def log_to_screen(self, phase):
+ # log to screen
+ if phase == RunPhase.TRAIN:
+ # for the training phase - we log during the episode to visualize the progress in training
+ screen.log_dict(
+ OrderedDict([
+ ("Worker", self.task_id),
+ ("Episode", self.current_episode),
+ ("Loss", self.loss.values[-1]),
+ ("Training iteration", self.training_iteration)
+ ]),
+ prefix="Training"
+ )
+ else:
+ # for the evaluation phase - logging as in regular RL
+ Agent.log_to_screen(self, phase)
diff --git a/agents/n_step_q_agent.py b/agents/n_step_q_agent.py
index 0746523..3b464a8 100644
--- a/agents/n_step_q_agent.py
+++ b/agents/n_step_q_agent.py
@@ -45,7 +45,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
# 1-Step Q learning
q_st_plus_1 = self.main_network.target_network.predict(next_states)
- for i in reversed(xrange(num_transitions)):
+ for i in reversed(range(num_transitions)):
state_value_head_targets[i][actions[i]] = \
rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0)
@@ -56,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
else:
R = np.max(self.main_network.target_network.predict(np.expand_dims(next_states[-1], 0)))
- for i in reversed(xrange(num_transitions)):
+ for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
state_value_head_targets[i][actions[i]] = R
diff --git a/agents/policy_optimization_agent.py b/agents/policy_optimization_agent.py
index c64dbab..07aac6a 100644
--- a/agents/policy_optimization_agent.py
+++ b/agents/policy_optimization_agent.py
@@ -58,7 +58,7 @@ class PolicyOptimizationAgent(Agent):
("steps", self.total_steps_counter),
("training iteration", self.training_iteration)
]),
- prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
+ prefix=phase
)
def update_episode_statistics(self, episode):
diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py
index a034485..bbe6c59 100644
--- a/architectures/network_wrapper.py
+++ b/architectures/network_wrapper.py
@@ -75,11 +75,14 @@ class NetworkWrapper(object):
network_is_local=True)
if not self.tp.distributed and self.tp.framework == Frameworks.TensorFlow:
- self.model_saver = tf.train.Saver()
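+ # restore only the online network variables so that checkpoints also load correctly
+ # for agents holding more than one network copy; the target network is then synced
+ # from the restored online weights below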
+ variables_to_restore = tf.global_variables()
+ variables_to_restore = [v for v in variables_to_restore if '/online' in v.name]
+ self.model_saver = tf.train.Saver(variables_to_restore)
if self.tp.sess and self.tp.checkpoint_restore_dir:
checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir)
screen.log_title("Loading checkpoint: {}".format(checkpoint))
self.model_saver.restore(self.tp.sess, checkpoint)
+ self.update_target_network()
def sync(self):
"""
diff --git a/architectures/tensorflow_components/embedders.py b/architectures/tensorflow_components/embedders.py
index 2b3212c..6b6acd2 100644
--- a/architectures/tensorflow_components/embedders.py
+++ b/architectures/tensorflow_components/embedders.py
@@ -15,15 +15,18 @@
#
import tensorflow as tf
+from configurations import EmbedderComplexity
class InputEmbedder(object):
- def __init__(self, input_size, activation_function=tf.nn.relu, name="embedder"):
+ def __init__(self, input_size, activation_function=tf.nn.relu,
+ embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
self.name = name
self.input_size = input_size
self.activation_function = activation_function
self.input = None
self.output = None
+ self.embedder_complexity = embedder_complexity
def __call__(self, prev_input_placeholder=None):
with tf.variable_scope(self.get_name()):
@@ -43,31 +46,77 @@ class InputEmbedder(object):
class ImageEmbedder(InputEmbedder):
- def __init__(self, input_size, input_rescaler=255.0, activation_function=tf.nn.relu, name="embedder"):
- InputEmbedder.__init__(self, input_size, activation_function, name)
+ def __init__(self, input_size, input_rescaler=255.0, activation_function=tf.nn.relu,
+ embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
+ InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
self.input_rescaler = input_rescaler
def _build_module(self):
# image observation
rescaled_observation_stack = self.input / self.input_rescaler
- self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
- filters=32, kernel_size=(8, 8), strides=(4, 4),
- activation=self.activation_function, data_format='channels_last')
- self.observation_conv2 = tf.layers.conv2d(self.observation_conv1,
- filters=64, kernel_size=(4, 4), strides=(2, 2),
- activation=self.activation_function, data_format='channels_last')
- self.observation_conv3 = tf.layers.conv2d(self.observation_conv2,
- filters=64, kernel_size=(3, 3), strides=(1, 1),
- activation=self.activation_function, data_format='channels_last')
- self.output = tf.contrib.layers.flatten(self.observation_conv3)
+ if self.embedder_complexity == EmbedderComplexity.Shallow:
+ # same embedder as used in the original DQN paper
+ self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
+ filters=32, kernel_size=(8, 8), strides=(4, 4),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv2 = tf.layers.conv2d(self.observation_conv1,
+ filters=64, kernel_size=(4, 4), strides=(2, 2),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv3 = tf.layers.conv2d(self.observation_conv2,
+ filters=64, kernel_size=(3, 3), strides=(1, 1),
+ activation=self.activation_function, data_format='channels_last')
+
+ self.output = tf.contrib.layers.flatten(self.observation_conv3)
+
+ elif self.embedder_complexity == EmbedderComplexity.Deep:
+ # the embedder used in the CARLA papers
+ self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack,
+ filters=32, kernel_size=(5, 5), strides=(2, 2),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv2 = tf.layers.conv2d(self.observation_conv1,
+ filters=32, kernel_size=(3, 3), strides=(1, 1),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv3 = tf.layers.conv2d(self.observation_conv2,
+ filters=64, kernel_size=(3, 3), strides=(2, 2),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv4 = tf.layers.conv2d(self.observation_conv3,
+ filters=64, kernel_size=(3, 3), strides=(1, 1),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv5 = tf.layers.conv2d(self.observation_conv4,
+ filters=128, kernel_size=(3, 3), strides=(2, 2),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv6 = tf.layers.conv2d(self.observation_conv5,
+ filters=128, kernel_size=(3, 3), strides=(1, 1),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv7 = tf.layers.conv2d(self.observation_conv6,
+ filters=256, kernel_size=(3, 3), strides=(2, 2),
+ activation=self.activation_function, data_format='channels_last')
+ self.observation_conv8 = tf.layers.conv2d(self.observation_conv7,
+ filters=256, kernel_size=(3, 3), strides=(1, 1),
+ activation=self.activation_function, data_format='channels_last')
+
+ self.output = tf.contrib.layers.flatten(self.observation_conv8)
+ else:
+ raise ValueError("The defined embedder complexity value is invalid")
class VectorEmbedder(InputEmbedder):
- def __init__(self, input_size, activation_function=tf.nn.relu, name="embedder"):
- InputEmbedder.__init__(self, input_size, activation_function, name)
+ def __init__(self, input_size, activation_function=tf.nn.relu,
+ embedder_complexity=EmbedderComplexity.Shallow, name="embedder"):
+ InputEmbedder.__init__(self, input_size, activation_function, embedder_complexity, name)
def _build_module(self):
# vector observation
input_layer = tf.contrib.layers.flatten(self.input)
- self.output = tf.layers.dense(input_layer, 256, activation=self.activation_function)
+
+ if self.embedder_complexity == EmbedderComplexity.Shallow:
+ self.output = tf.layers.dense(input_layer, 256, activation=self.activation_function)
+
+ elif self.embedder_complexity == EmbedderComplexity.Deep:
+ # the embedder used in the CARLA papers
+ self.observation_fc1 = tf.layers.dense(input_layer, 128, activation=self.activation_function)
+ self.observation_fc2 = tf.layers.dense(self.observation_fc1, 128, activation=self.activation_function)
+ self.output = tf.layers.dense(self.observation_fc2, 128, activation=self.activation_function)
+ else:
+ raise ValueError("The defined embedder complexity value is invalid")
diff --git a/coach.py b/coach.py
index ffddbc9..45b7382 100644
--- a/coach.py
+++ b/coach.py
@@ -37,8 +37,29 @@ time_started = datetime.datetime.now()
cur_time = time_started.time()
cur_date = time_started.date()
-def get_experiment_path(general_experiments_path):
- if not os.path.exists(general_experiments_path):
+
+def get_experiment_name(initial_experiment_name=''):
+ match = None
+ while match is None:
+ if initial_experiment_name == '':
+ experiment_name = screen.ask_input("Please enter an experiment name: ")
+ else:
+ experiment_name = initial_experiment_name
+
+ experiment_name = experiment_name.replace(" ", "_")
+ match = re.match("^$|^[\w -/]{1,100}$", experiment_name)
+
+ if match is None:
+ screen.error('Experiment name must be composed only of alphanumeric letters, '
+ 'underscores and dashes and should not be longer than 100 characters.')
+
+ return match.group(0)
+
+
+def get_experiment_path(experiment_name, create_path=True):
+ general_experiments_path = os.path.join('./experiments/', experiment_name)
+
+ if not os.path.exists(general_experiments_path) and create_path:
os.makedirs(general_experiments_path)
experiment_path = os.path.join(general_experiments_path, '{}_{}_{}-{}_{}'
.format(logger.two_digits(cur_date.day), logger.two_digits(cur_date.month),
@@ -52,7 +73,8 @@ def get_experiment_path(general_experiments_path):
cur_time.minute, i))
i += 1
else:
- os.makedirs(experiment_path)
+ if create_path:
+ os.makedirs(experiment_path)
return experiment_path
@@ -96,55 +118,54 @@ def check_input_and_fill_run_dict(parser):
num_workers = int(re.match("^\d+$", args.num_workers).group(0))
except ValueError:
screen.error("Parameter num_workers should be an integer.")
- exit(1)
preset_names = list_all_classes_in_module(presets)
if args.preset is not None and args.preset not in preset_names:
screen.error("A non-existing preset was selected. ")
- exit(1)
if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
screen.error("The requested checkpoint folder to load from does not exist. ")
- exit(1)
if args.save_model_sec is not None:
try:
args.save_model_sec = int(args.save_model_sec)
except ValueError:
screen.error("Parameter save_model_sec should be an integer.")
- exit(1)
if args.preset is None and (args.agent_type is None or args.environment_type is None
- or args.exploration_policy_type is None):
+ or args.exploration_policy_type is None) and not args.play:
screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,'
' environment_type and exploration_policy_type to assemble a preset. '
'\nAt least one of these parameters was not given.')
- exit(1)
+ elif args.preset is None and args.play and args.environment_type is None:
+ screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,'
+ ' the user is expected to input the desired environment_type and level.'
+ '\nAt least one of these parameters was not given.')
+ elif args.preset is None and args.play and args.environment_type:
+ args.agent_type = 'Human'
+ args.exploration_policy_type = 'ExplorationParameters'
- experiment_name = args.experiment_name
+ # get experiment name and path
+ experiment_name = get_experiment_name(args.experiment_name)
+ experiment_path = get_experiment_path(experiment_name)
- if args.experiment_name == '':
- experiment_name = screen.ask_input("Please enter an experiment name: ")
-
- experiment_name = experiment_name.replace(" ", "_")
- match = re.match("^$|^\w{1,100}$", experiment_name)
-
- if match is None:
- screen.error('Experiment name must be composed only of alphanumeric letters and underscores and should not be '
- 'longer than 100 characters.')
- exit(1)
- experiment_path = os.path.join('./experiments/', match.group(0))
- experiment_path = get_experiment_path(experiment_path)
+ if args.play and num_workers > 1:
+ screen.warning("Playing the game as a human is only available with a single worker. "
+ "The number of workers will be reduced to 1")
+ num_workers = 1
# fill run_dict
run_dict = dict()
run_dict['agent_type'] = args.agent_type
run_dict['environment_type'] = args.environment_type
run_dict['exploration_policy_type'] = args.exploration_policy_type
+ run_dict['level'] = args.level
run_dict['preset'] = args.preset
run_dict['custom_parameter'] = args.custom_parameter
run_dict['experiment_path'] = experiment_path
run_dict['framework'] = Frameworks().get(args.framework)
+ run_dict['play'] = args.play
+ run_dict['evaluate'] = args.evaluate  # or args.play
# multi-threading parameters
run_dict['num_threads'] = num_workers
@@ -197,6 +218,14 @@ if __name__ == "__main__":
help="(int) Number of workers for multi-process based agents, e.g. A3C",
default='1',
type=str)
+ parser.add_argument('--play',
+ help="(flag) Play as a human by controlling the game with the keyboard. "
+ "This option will save a replay buffer with the game play.",
+ action='store_true')
+ parser.add_argument('--evaluate',
+ help="(flag) Run evaluation only. This is a convenient way to disable "
+ "training in order to evaluate an existing checkpoint.",
+ action='store_true')
parser.add_argument('-v', '--verbose',
help="(flag) Don't suppress TensorFlow debug prints.",
action='store_true')
@@ -230,6 +259,12 @@ if __name__ == "__main__":
,
default=None,
type=str)
+ parser.add_argument('-lvl', '--level',
+ help="(string) Choose the level that will be played in the environment that was selected."
+ "This value will override the level parameter in the environment class."
+ ,
+ default=None,
+ type=str)
parser.add_argument('-cp', '--custom_parameter',
help="(string) Semicolon separated parameters used to override specific parameters on top of"
" the selected preset (or on top of the command-line assembled one). "
@@ -259,7 +294,12 @@ if __name__ == "__main__":
tuning_parameters.task_index = 0
env_instance = create_environment(tuning_parameters)
agent = eval(tuning_parameters.agent.type + '(env_instance, tuning_parameters)')
- agent.improve()
+
+ # Start the training or evaluation
+ if tuning_parameters.evaluate:
+ agent.evaluate(sys.maxsize, keep_networks_synced=True) # evaluate forever
+ else:
+ agent.improve()
# Multi-threaded runs
else:
diff --git a/configurations.py b/configurations.py
index b7f9953..8480c19 100644
--- a/configurations.py
+++ b/configurations.py
@@ -32,6 +32,11 @@ class InputTypes(object):
TimedObservation = 5
+class EmbedderComplexity(object):
+ Shallow = 1
+ Deep = 2
+
+
class OutputTypes(object):
Q = 1
DuelingQ = 2
@@ -60,6 +65,7 @@ class AgentParameters(object):
middleware_type = MiddlewareTypes.FC
loss_weights = [1.0]
stop_gradients_from_head = [False]
+ embedder_complexity = EmbedderComplexity.Shallow
num_output_head_copies = 1
use_measurements = False
use_accumulated_reward_as_measurement = False
@@ -90,6 +96,8 @@ class AgentParameters(object):
step_until_collecting_full_episodes = False
targets_horizon = 'N-Step'
replace_mse_with_huber_loss = False
+ load_memory_from_file_path = None
+ collect_new_data = True
# PPO related params
target_kl_divergence = 0.01
@@ -132,6 +140,7 @@ class EnvironmentParameters(object):
reward_scaling = 1.0
reward_clipping_min = None
reward_clipping_max = None
+ human_control = False
class ExplorationParameters(object):
@@ -188,6 +197,7 @@ class GeneralParameters(object):
kl_divergence_constraint = 100000
num_training_iterations = 10000000000
num_heatup_steps = 1000
+ heatup_using_network_decisions = False
batch_size = 32
save_model_sec = None
save_model_dir = None
@@ -197,6 +207,7 @@ class GeneralParameters(object):
learning_rate_decay_steps = 0
evaluation_episodes = 5
evaluate_every_x_episodes = 1000000
+ evaluate_every_x_training_iterations = 0
rescaling_interpolation_type = 'bilinear'
# setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
@@ -224,6 +235,7 @@ class VisualizationParameters(object):
dump_signals_to_csv_every_x_episodes = 10
render = False
dump_gifs = True
+ max_fps_for_human_control = 10
class Roboschool(EnvironmentParameters):
@@ -252,7 +264,7 @@ class Bullet(EnvironmentParameters):
class Atari(EnvironmentParameters):
type = 'Gym'
- frame_skip = 1
+ frame_skip = 4
observation_stack_size = 4
desired_observation_height = 84
desired_observation_width = 84
@@ -268,6 +280,31 @@ class Doom(EnvironmentParameters):
desired_observation_width = 76
+class Carla(EnvironmentParameters):
+ type = 'Carla'
+ frame_skip = 1
+ observation_stack_size = 4
+ desired_observation_height = 128
+ desired_observation_width = 180
+ normalize_observation = False
+ server_height = 256
+ server_width = 360
+ config = 'environments/CarlaSettings.ini'
+ level = 'town1'
+ verbose = True
+ stereo = False
+ semantic_segmentation = False
+ depth = False
+ episode_max_time = 100000 # milliseconds for each episode
+ continuous_to_bool_threshold = 0.5
+ allow_braking = False
+
+
+class Human(AgentParameters):
+ type = 'HumanAgent'
+ num_episodes_in_experience_replay = 10000000
+
+
class NStepQ(AgentParameters):
type = 'NStepQAgent'
input_types = [InputTypes.Observation]
@@ -299,10 +336,12 @@ class DQN(AgentParameters):
class DDQN(DQN):
type = 'DDQNAgent'
+
class DuelingDQN(DQN):
type = 'DQNAgent'
output_types = [OutputTypes.DuelingQ]
+
class BootstrappedDQN(DQN):
type = 'BootstrappedDQNAgent'
num_output_head_copies = 10
@@ -314,6 +353,7 @@ class CategoricalDQN(DQN):
v_min = -10.0
v_max = 10.0
atoms = 51
+ neon_support = False
class QuantileRegressionDQN(DQN):
@@ -452,6 +492,7 @@ class ClippedPPO(AgentParameters):
step_until_collecting_full_episodes = True
beta_entropy = 0.01
+
class DFP(AgentParameters):
type = 'DFPAgent'
input_types = [InputTypes.Observation, InputTypes.Measurements, InputTypes.GoalVector]
@@ -485,6 +526,15 @@ class PAL(AgentParameters):
neon_support = True
+class BC(AgentParameters):
+ type = 'BCAgent'
+ input_types = [InputTypes.Observation]
+ output_types = [OutputTypes.Q]
+ loss_weights = [1.0]
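+ # BC trains only from a pre-recorded replay buffer (e.g. one saved with --play),
+ # so no new data is collected and evaluation is scheduled by training iterations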
+ collect_new_data = False
+ evaluate_every_x_training_iterations = 50000
+
+
class EGreedyExploration(ExplorationParameters):
policy = 'EGreedy'
initial_epsilon = 0.5
diff --git a/docs/docs/algorithms/imitation/bc.md b/docs/docs/algorithms/imitation/bc.md
new file mode 100644
index 0000000..84e477a
--- /dev/null
+++ b/docs/docs/algorithms/imitation/bc.md
@@ -0,0 +1,25 @@
+# Behavioral Cloning
+
+**Action space:** Discrete|Continuous
+
+## Network Structure
+
+
+
+
+
+
+
+
+
+