diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..b2ed5e5 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include *.txt +include rl_coach/environments/CarlaSettings.ini +include rl_coach/dashboard_components/spinner.css diff --git a/README.md b/README.md index 84291a8..a93f7a9 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Coach [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/NervanaSystems/coach/blob/master/LICENSE) -[![Docs](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](http://NervanaSystems.github.io/coach/) +[![Docs](https://media.readthedocs.org/static/projects/badges/passing-flat.svg)](https://nervanasystems.github.io/coach/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1134898.svg)](https://doi.org/10.5281/zenodo.1134898) -## Overview +

Coach Logo

Coach is a Python reinforcement learning research framework containing implementations of many state-of-the-art algorithms. @@ -36,7 +36,6 @@ Contacting the Coach development team is also possible through the email [coach@ * [Usage](#usage) + [Running Coach](#running-coach) + [Running Coach Dashboard (Visualization)](#running-coach-dashboard-visualization) - + [Parallelizing an Algorithm](#parallelizing-an-algorithm) * [Supported Environments](#supported-environments) * [Supported Algorithms](#supported-algorithms) * [Citation](#citation) @@ -44,56 +43,69 @@ Contacting the Coach development team is also possible through the email [coach@ ## Documentation -Framework documentation, algorithm description and instructions on how to contribute a new agent/environment can be found [here](http://NervanaSystems.github.io/coach/). +Framework documentation, algorithm description and instructions on how to contribute a new agent/environment can be found [here](https://nervanasystems.github.io/coach/). ## Installation Note: Coach has only been tested on Ubuntu 16.04 LTS, and with Python 3.5. -### Coach Installer +For some information on installing on Ubuntu 17.10 with Python 3.6.3, please refer to the following issue: https://github.com/NervanaSystems/coach/issues/54 -Coach's installer will setup all the basics needed to get the user going with running Coach on top of [OpenAI Gym](https://github.com/openai/gym) environments. This can be done by running the following command and then following the on-screen printed instructions: +In order to install Coach, a few prerequisites are required. Installing them will set up all the basics needed to get the user going with running Coach on top of [OpenAI Gym](https://github.com/openai/gym) environments: -```bash -./install.sh +``` +# General +sudo -E apt-get install python3-pip cmake zlib1g-dev python3-tk python-opencv -y + +# Boost libraries +sudo -E apt-get install libboost-all-dev -y + +# Scipy requirements +sudo -E apt-get install libblas-dev liblapack-dev libatlas-base-dev gfortran -y + +# PyGame +sudo -E apt-get install libsdl-dev libsdl-image1.2-dev libsdl-mixer1.2-dev libsdl-ttf2.0-dev +libsmpeg-dev libportmidi-dev libavformat-dev libswscale-dev -y + +# Dashboard +sudo -E apt-get install dpkg-dev build-essential python3.5-dev libjpeg-dev libtiff-dev libsdl1.2-dev libnotify-dev +freeglut3 freeglut3-dev libsm-dev libgtk2.0-dev libgtk-3-dev libwebkitgtk-dev libwebkitgtk-3.0-dev +libgstreamer-plugins-base1.0-dev -y + +# Gym +sudo -E apt-get install libav-tools libsdl2-dev swig cmake -y ``` -Coach creates a virtual environment and installs in it to avoid changes to the user's system. +We recommend installing Coach in a virtualenv: -In order to activate and deactivate Coach's virtual environment: - -```bash -source coach_env/bin/activate +``` +sudo -E pip3 install virtualenv +virtualenv -p python3 coach_env +. coach_env/bin/activate ``` -```bash -deactivate +Finally, install Coach using pip: ``` +pip3 install rl_coach +``` + +Alternatively, for a development environment, install Coach from the cloned repository: ``` +cd coach +pip3 install -e . +``` + +If a GPU is present, Coach's pip package will install tensorflow-gpu by default. If a GPU is not present, an [Intel-Optimized TensorFlow](https://software.intel.com/en-us/articles/intel-optimized-tensorflow-wheel-now-available) will be installed.
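As a quick sanity check after the pip installation (a minimal sketch, assuming the virtualenv created above is still active), the following commands simply import the installed package and report which TensorFlow build was picked up:

```
# Run inside the activated virtualenv
python3 -c "import rl_coach; print('rl_coach imported successfully')"
python3 -c "import tensorflow as tf; print(tf.__version__, tf.test.is_built_with_cuda())"
```

If either import fails, revisiting the prerequisites listed above is usually the quickest fix.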
In addition to OpenAI Gym, several other environments were tested and are supported. Please follow the instructions in the Supported Environments section below in order to install more environments. -### TensorFlow GPU Support - -Coach's installer installs [Intel-Optimized TensorFlow](https://software.intel.com/en-us/articles/intel-optimized-tensorflow-wheel-now-available), which does not support GPU, by default. In order to have Coach running with GPU, a GPU supported TensorFlow version must be installed. This can be done by overriding the TensorFlow version: - -```bash -pip3 install tensorflow-gpu -``` - ## Usage ### Running Coach -Coach supports both TensorFlow and neon deep learning frameworks. - -Switching between TensorFlow and neon backends is possible by using the `-f` flag. - -Using TensorFlow (default): `-f tensorflow` - -Using neon: `-f neon` - -There are several available presets in presets.py. +To allow reproducing results in Coach, we defined a mechanism called _preset_. +There are several available presets under the `presets` directory. To list all the available presets, use the `-l` flag. To run a preset, use: @@ -103,39 +115,44 @@ python3 coach.py -r -p ``` For example: -1. CartPole environment using Policy Gradients: +* CartPole environment using Policy Gradients (PG): ```bash python3 coach.py -r -p CartPole_PG ``` - -2. Pendulum using Clipped PPO: + +* Basic level of Doom using Dueling network and Double DQN (DDQN) algorithm: ```bash - python3 coach.py -r -p Pendulum_ClippedPPO -n 8 + python3 coach.py -r -p Doom_Basic_Dueling_DDQN ``` -3. MountainCar using A3C: +Some presets apply to a group of environment levels, such as the entire Atari or MuJoCo suites. +To use these presets, the requested level should be defined using the `-lvl` flag. + +For example: + + +* Pong using the Neural Episodic Control (NEC) algorithm: ```bash - python3 coach.py -r -p MountainCar_A3C -n 8 + python3 coach.py -r -p Atari_NEC -lvl pong ``` -4. Doom basic level using Dueling network and Double DQN algorithm: +Several types of agents can benefit from running in a distributed fashion with multiple workers in parallel. Each worker interacts with its own copy of the environment but updates a shared network, which improves the data collection speed and the stability of the learning process. +To specify the number of workers to run, use the `-n` flag. + +For example: +* Breakout using Asynchronous Advantage Actor-Critic (A3C) with 8 workers: ```bash - python3 coach.py -r -p Doom_Basic_Dueling_DDQN + python3 coach.py -r -p Atari_A3C -lvl breakout -n 8 ``` -5. Doom health gathering level using Mixed Monte Carlo: - - ```bash - python3 coach.py -r -p Doom_Health_MMC - ``` It is easy to create new presets for different levels or environments by following the same pattern as in presets.py -More usage examples can be found [here](http://NervanaSystems.github.io/coach/usage/index.html). +More usage examples can be found [here](https://nervanasystems.github.io/coach/usage/index.html). ### Running Coach Dashboard (Visualization) Training an agent to solve an environment can be tricky, at times. @@ -152,36 +169,14 @@ python3 dashboard.py -Coach Design - - -### Parallelizing an Algorithm - -Since the introduction of [A3C](https://arxiv.org/abs/1602.01783) in 2016, many algorithms were shown to benefit from running multiple instances in parallel, on many CPU cores.
So far, these algorithms include [A3C](https://arxiv.org/abs/1602.01783), [DDPG](https://arxiv.org/pdf/1704.03073.pdf), [PPO](https://arxiv.org/pdf/1707.06347.pdf), and [NAF](https://arxiv.org/pdf/1610.00633.pdf), and this is most probably only the begining. - -Parallelizing an algorithm using Coach is straight-forward. - -The following method of NetworkWrapper parallelizes an algorithm seamlessly: - -```python -network.train_and_sync_networks(current_states, targets) -``` - -Once a parallelized run is started, the ```train_and_sync_networks``` API will apply gradients from each local worker's network to the main global network, allowing for parallel training to take place. - -Then, it merely requires running Coach with the ``` -n``` flag and with the number of workers to run with. For instance, the following command will set 16 workers to work together to train a MuJoCo Hopper: - -```bash -python3 coach.py -p Hopper_A3C -n 16 -``` - +Coach Design ## Supported Environments * *OpenAI Gym:* - Installed by default by Coach's installer. + Installed by default by Coach's installer. The version used by Coach is 0.10.5. * *ViZDoom:* @@ -189,6 +184,7 @@ python3 coach.py -p Hopper_A3C -n 16 https://github.com/mwydmuch/ViZDoom + The version currently used by Coach is 1.1.4. Additionally, Coach assumes that the environment variable VIZDOOM_ROOT points to the ViZDoom installation directory. * *Roboschool:* @@ -211,7 +207,7 @@ python3 coach.py -p Hopper_A3C -n 16 * *CARLA:* - Download release 0.7 from the CARLA repository - + Download release 0.8.4 from the CARLA repository - https://github.com/carla-simulator/carla/releases @@ -219,6 +215,22 @@ python3 coach.py -p Hopper_A3C -n 16 A simple CARLA settings file (```CarlaSettings.ini```) is supplied with Coach, and is located in the ```environments``` directory. 
+* *Starcraft:* + + Follow the instructions described in the PySC2 repository - + + https://github.com/deepmind/pysc2 + + The version used by Coach is 2.0.1 + +* *DeepMind Control Suite:* + + Follow the instructions described in the DeepMind Control Suite repository - + + https://github.com/deepmind/dm_control + + The version used by Coach is 0.0.0 + ## Supported Algorithms @@ -227,25 +239,47 @@ python3 coach.py -p Hopper_A3C -n 16 - -* [Deep Q Network (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) ([code](agents/dqn_agent.py)) -* [Double Deep Q Network (DDQN)](https://arxiv.org/pdf/1509.06461.pdf) ([code](agents/ddqn_agent.py)) +### Value Optimization Agents +* [Deep Q Network (DQN)](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) ([code](rl_coach/agents/dqn_agent.py)) +* [Double Deep Q Network (DDQN)](https://arxiv.org/pdf/1509.06461.pdf) ([code](rl_coach/agents/ddqn_agent.py)) * [Dueling Q Network](https://arxiv.org/abs/1511.06581) -* [Mixed Monte Carlo (MMC)](https://arxiv.org/abs/1703.01310) ([code](agents/mmc_agent.py)) -* [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860) ([code](agents/pal_agent.py)) -* [Categorical Deep Q Network (C51)](https://arxiv.org/abs/1707.06887) ([code](agents/categorical_dqn_agent.py)) -* [Quantile Regression Deep Q Network (QR-DQN)](https://arxiv.org/pdf/1710.10044v1.pdf) ([code](agents/qr_dqn_agent.py)) -* [Bootstrapped Deep Q Network](https://arxiv.org/abs/1602.04621) ([code](agents/bootstrapped_dqn_agent.py)) -* [N-Step Q Learning](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](agents/n_step_q_agent.py)) -* [Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988) ([code](agents/nec_agent.py)) -* [Normalized Advantage Functions (NAF)](https://arxiv.org/abs/1603.00748.pdf) | **Distributed** ([code](agents/naf_agent.py)) -* [Policy Gradients (PG)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | **Distributed** ([code](agents/policy_gradients_agent.py)) -* [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](agents/actor_critic_agent.py)) -* [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | **Distributed** ([code](agents/ddpg_agent.py)) -* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf) ([code](agents/ppo_agent.py)) -* [Clipped Proximal Policy Optimization](https://arxiv.org/pdf/1707.06347.pdf) | **Distributed** ([code](agents/clipped_ppo_agent.py)) -* [Direct Future Prediction (DFP)](https://arxiv.org/abs/1611.01779) | **Distributed** ([code](agents/dfp_agent.py)) -* Behavioral Cloning (BC) ([code](agents/bc_agent.py)) +* [Mixed Monte Carlo (MMC)](https://arxiv.org/abs/1703.01310) ([code](rl_coach/agents/mmc_agent.py)) +* [Persistent Advantage Learning (PAL)](https://arxiv.org/abs/1512.04860) ([code](rl_coach/agents/pal_agent.py)) +* [Categorical Deep Q Network (C51)](https://arxiv.org/abs/1707.06887) ([code](rl_coach/agents/categorical_dqn_agent.py)) +* [Quantile Regression Deep Q Network (QR-DQN)](https://arxiv.org/pdf/1710.10044v1.pdf) ([code](rl_coach/agents/qr_dqn_agent.py)) +* [N-Step Q Learning](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](rl_coach/agents/n_step_q_agent.py)) +* [Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988) ([code](rl_coach/agents/nec_agent.py)) +* [Normalized Advantage Functions (NAF)](https://arxiv.org/abs/1603.00748.pdf) | **Distributed** ([code](rl_coach/agents/naf_agent.py)) + +### Policy Optimization Agents +* 
[Policy Gradients (PG)](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | **Distributed** ([code](rl_coach/agents/policy_gradients_agent.py)) +* [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/abs/1602.01783) | **Distributed** ([code](rl_coach/agents/actor_critic_agent.py)) +* [Deep Deterministic Policy Gradients (DDPG)](https://arxiv.org/abs/1509.02971) | **Distributed** ([code](rl_coach/agents/ddpg_agent.py)) +* [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf) ([code](rl_coach/agents/ppo_agent.py)) +* [Clipped Proximal Policy Optimization (CPPO)](https://arxiv.org/pdf/1707.06347.pdf) | **Distributed** ([code](rl_coach/agents/clipped_ppo_agent.py)) +* [Generalized Advantage Estimation (GAE)](https://arxiv.org/abs/1506.02438) ([code](rl_coach/agents/actor_critic_agent.py#L86)) + +### General Agents +* [Direct Future Prediction (DFP)](https://arxiv.org/abs/1611.01779) | **Distributed** ([code](rl_coach/agents/dfp_agent.py)) + +### Imitation Learning Agents +* Behavioral Cloning (BC) ([code](rl_coach/agents/bc_agent.py)) + +### Hierarchical Reinforcement Learning Agents +* [Hierarchical Actor Critic (HAC)](https://arxiv.org/abs/1712.00948.pdf) ([code](rl_coach/agents/ddpg_hac_agent.py)) + +### Memory Types +* [Hindsight Experience Replay (HER)](https://arxiv.org/abs/1707.01495.pdf) ([code](rl_coach/memories/episodic/episodic_hindsight_experience_replay.py)) +* [Prioritized Experience Replay (PER)](https://arxiv.org/abs/1511.05952) ([code](rl_coach/memories/non_episodic/prioritized_experience_replay.py)) + +### Exploration Techniques +* E-Greedy ([code](rl_coach/exploration_policies/e_greedy.py)) +* Boltzmann ([code](rl_coach/exploration_policies/boltzmann.py)) +* Ornstein–Uhlenbeck process ([code](rl_coach/exploration_policies/ou_process.py)) +* Normal Noise ([code](rl_coach/exploration_policies/additive_noise.py)) +* Truncated Normal Noise ([code](rl_coach/exploration_policies/truncated_normal.py)) +* [Bootstrapped Deep Q Network](https://arxiv.org/abs/1602.04621) ([code](rl_coach/agents/bootstrapped_dqn_agent.py)) +* [UCB Exploration via Q-Ensembles (UCB)](https://arxiv.org/abs/1706.01502) ([code](rl_coach/exploration_policies/ucb.py)) ## Citation diff --git a/agents/__init__.py b/agents/__init__.py deleted file mode 100644 index fdbd13e..0000000 --- a/agents/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from agents.actor_critic_agent import * -from agents.agent import * -from agents.bc_agent import * -from agents.bootstrapped_dqn_agent import * -from agents.clipped_ppo_agent import * -from agents.ddpg_agent import * -from agents.ddqn_agent import * -from agents.dfp_agent import * -from agents.dqn_agent import * -from agents.categorical_dqn_agent import * -from agents.human_agent import * -from agents.imitation_agent import * -from agents.mmc_agent import * -from agents.n_step_q_agent import * -from agents.naf_agent import * -from agents.nec_agent import * -from agents.pal_agent import * -from agents.policy_gradients_agent import * -from agents.policy_optimization_agent import * -from agents.ppo_agent import * -from agents.value_optimization_agent import * -from agents.qr_dqn_agent import * diff --git a/agents/actor_critic_agent.py b/agents/actor_critic_agent.py deleted file mode 100644 index 729e67f..0000000 --- a/agents/actor_critic_agent.py +++ /dev/null @@ -1,146 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.policy_optimization_agent import * -from logger import * -from utils import * -import scipy.signal - - -# Actor Critic - https://arxiv.org/abs/1602.01783 -class ActorCriticAgent(PolicyOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False): - PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network) - self.last_gradient_update_step_idx = 0 - self.action_advantages = Signal('Advantages') - self.state_values = Signal('Values') - self.unclipped_grads = Signal('Grads (unclipped)') - self.value_loss = Signal('Value Loss') - self.policy_loss = Signal('Policy Loss') - self.signals.append(self.action_advantages) - self.signals.append(self.state_values) - self.signals.append(self.unclipped_grads) - self.signals.append(self.value_loss) - self.signals.append(self.policy_loss) - - # Discounting function used to calculate discounted returns. - def discount(self, x, gamma): - return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] - - def get_general_advantage_estimation_values(self, rewards, values): - # values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... t + n) - bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]]) - - # Approximation based calculation of GAE (mathematically correct only when Tmax = inf, - # although in practice works even in much smaller Tmax values, e.g. 
20) - deltas = rewards + self.tp.agent.discount * values[1:] - values[:-1] - gae = self.discount(deltas, self.tp.agent.discount * self.tp.agent.gae_lambda) - - if self.tp.agent.estimate_value_using_gae: - discounted_returns = np.expand_dims(gae + values[:-1], -1) - else: - discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards, - self.tp.agent.discount)), 1)[:-1] - return gae, discounted_returns - - def learn_from_batch(self, batch): - # batch contains a list of episodes to learn from - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # get the values for the current states - result = self.main_network.online_network.predict(current_states) - current_state_values = result[0] - self.state_values.add_sample(current_state_values) - - # the targets for the state value estimator - num_transitions = len(game_overs) - state_value_head_targets = np.zeros((num_transitions, 1)) - - # estimate the advantage function - action_advantages = np.zeros((num_transitions, 1)) - - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: - if game_overs[-1]: - R = 0 - else: - R = self.main_network.online_network.predict(last_sample(next_states))[0] - - for i in reversed(range(num_transitions)): - R = rewards[i] + self.tp.agent.discount * R - state_value_head_targets[i] = R - action_advantages[i] = R - current_state_values[i] - - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: - # get bootstraps - bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0] - values = np.append(current_state_values, bootstrapped_value) - if game_overs[-1]: - values[-1] = 0 - - # get general discounted returns table - gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values) - action_advantages = np.vstack(gae_values) - else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") - - action_advantages = action_advantages.squeeze(axis=-1) - if not self.env.discrete_controls and len(actions.shape) < 2: - actions = np.expand_dims(actions, -1) - - # train - result = self.main_network.online_network.accumulate_gradients({**current_states, 'output_1_0': actions}, - [state_value_head_targets, action_advantages]) - - # logging - total_loss, losses, unclipped_grads = result[:3] - self.action_advantages.add_sample(action_advantages) - self.unclipped_grads.add_sample(unclipped_grads) - self.value_loss.add_sample(losses[0]) - self.policy_loss.add_sample(losses[1]) - - return total_loss - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - # TODO: rename curr_state -> state - - # convert to batch so we can run it through the network - curr_state = { - k: np.expand_dims(np.array(curr_state[k]), 0) - for k in curr_state.keys() - } - - if self.env.discrete_controls: - # DISCRETE - state_value, action_probabilities = self.main_network.online_network.predict(curr_state) - action_probabilities = action_probabilities.squeeze() - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_probabilities) - else: - action = np.argmax(action_probabilities) - action_info = {"action_probability": action_probabilities[action], "state_value": state_value} - self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities + eps))) - else: - # CONTINUOUS - state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state) - action_values_mean = 
action_values_mean.squeeze() - action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: - action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) - else: - action = action_values_mean - action_info = {"action_probability": action, "state_value": state_value} - - return action, action_info diff --git a/agents/agent.py b/agents/agent.py deleted file mode 100644 index 888f1b2..0000000 --- a/agents/agent.py +++ /dev/null @@ -1,580 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import scipy.ndimage -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") - -import copy -from renderer import Renderer -from configurations import Preset -from collections import deque -from utils import LazyStack -from collections import OrderedDict -from utils import RunPhase, Signal, is_empty, RunningStat -from architectures import * -from exploration_policies import * -from memories import * -from memories.memory import * -from logger import logger, screen -import random -import time -import os -import itertools -from architectures.tensorflow_components.shared_variables import SharedRunningStats -from six.moves import range - - -class Agent(object): - def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0): - """ - :param env: An environment instance - :type env: EnvironmentWrapper - :param tuning_parameters: A Preset class instance with all the running paramaters - :type tuning_parameters: Preset - :param replicated_device: A tensorflow device for distributed training (optional) - :type replicated_device: instancemethod - :param thread_id: The current thread id - :param thread_id: int - """ - - screen.log_title("Creating agent {}".format(task_id)) - self.task_id = task_id - self.sess = tuning_parameters.sess - self.env = tuning_parameters.env_instance = env - self.imitation = False - - # i/o dimensions - if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height: - tuning_parameters.env.desired_observation_width = self.env.width - tuning_parameters.env.desired_observation_height = self.env.height - self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size - self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size - if tuning_parameters.agent.use_accumulated_reward_as_measurement: - self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,) - - # modules - if tuning_parameters.agent.load_memory_from_file_path: - screen.log_title("Loading replay buffer from pickle. 
Pickle path: {}" - .format(tuning_parameters.agent.load_memory_from_file_path)) - self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path) - else: - self.memory = eval(tuning_parameters.memory + '(tuning_parameters)') - # self.architecture = eval(tuning_parameters.architecture) - - self.has_global = replicated_device is not None - self.replicated_device = replicated_device - self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0" - - self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)') - self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy - + '(tuning_parameters)') - self.evaluation_exploration_policy.change_phase(RunPhase.TEST) - - # initialize all internal variables - self.tp = tuning_parameters - self.in_heatup = False - self.total_reward_in_current_episode = 0 - self.total_steps_counter = 0 - self.running_reward = None - self.training_iteration = 0 - self.current_episode = self.tp.current_episode = 0 - self.curr_state = {} - self.current_episode_steps_counter = 0 - self.episode_running_info = {} - self.last_episode_evaluation_ran = 0 - self.running_observations = [] - logger.set_current_time(self.current_episode) - self.main_network = None - self.networks = [] - self.last_episode_images = [] - self.renderer = Renderer() - - # signals - self.signals = [] - self.loss = Signal('Loss') - self.signals.append(self.loss) - self.curr_learning_rate = Signal('Learning Rate') - self.signals.append(self.curr_learning_rate) - - if self.tp.env.normalize_observation and not self.env.is_state_type_image: - if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers: - self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,)) - self.running_reward_stats = RunningStat(()) - if self.tp.checkpoint_restore_dir: - checkpoint_path = os.path.join(self.tp.checkpoint_restore_dir, "running_stats.p") - self.running_observation_stats = read_pickle(checkpoint_path) - else: - self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,)) - self.running_reward_stats = RunningStat(()) - else: - self.running_observation_stats = SharedRunningStats(self.tp, replicated_device, - shape=(self.tp.env.desired_observation_width,), - name='observation_stats') - self.running_reward_stats = SharedRunningStats(self.tp, replicated_device, - shape=(), - name='reward_stats') - - # env is already reset at this point. Otherwise we're getting an error where you cannot - # reset an env which is not done - self.reset_game(do_not_reset_env=True) - - # use seed - if self.tp.seed is not None: - random.seed(self.tp.seed) - np.random.seed(self.tp.seed) - - def log_to_screen(self, phase): - # log to screen - if self.current_episode >= 0: - if phase == RunPhase.TRAIN: - exploration = self.exploration_policy.get_control_param() - else: - exploration = self.evaluation_exploration_policy.get_control_param() - - screen.log_dict( - OrderedDict([ - ("Worker", self.task_id), - ("Episode", self.current_episode), - ("total reward", self.total_reward_in_current_episode), - ("exploration", exploration), - ("steps", self.total_steps_counter), - ("training iteration", self.training_iteration) - ]), - prefix=phase - ) - - def update_log(self, phase=RunPhase.TRAIN): - """ - Writes logging messages to screen and updates the log file with all the signal values. 
- :return: None - """ - # log all the signals to file - logger.set_current_time(self.current_episode) - logger.create_signal_value('Training Iter', self.training_iteration) - logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP)) - logger.create_signal_value('ER #Transitions', self.memory.num_transitions()) - logger.create_signal_value('ER #Episodes', self.memory.length()) - logger.create_signal_value('Episode Length', self.current_episode_steps_counter) - logger.create_signal_value('Total steps', self.total_steps_counter) - logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param()) - logger.create_signal_value("Training Reward", self.total_reward_in_current_episode - if phase == RunPhase.TRAIN else np.nan) - logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode - if phase == RunPhase.TEST else np.nan) - logger.create_signal_value('Update Target Network', 0, overwrite=False) - logger.update_wall_clock_time(self.current_episode) - - for signal in self.signals: - logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean()) - logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev()) - logger.create_signal_value("{}/Max".format(signal.name), signal.get_max()) - logger.create_signal_value("{}/Min".format(signal.name), signal.get_min()) - - # dump - if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \ - and self.current_episode > 0: - logger.dump_output_csv() - - def reset_game(self, do_not_reset_env=False): - """ - Resets all the episodic parameters and start a new environment episode. - :param do_not_reset_env: A boolean that allows prevention of environment reset - :return: None - """ - - for signal in self.signals: - signal.reset() - self.total_reward_in_current_episode = 0 - self.curr_state = {} - self.last_episode_images = [] - self.current_episode_steps_counter = 0 - self.episode_running_info = {} - if not do_not_reset_env: - self.env.reset() - self.exploration_policy.reset() - - # required for online plotting - if self.tp.visualization.plot_action_values_online: - if hasattr(self, 'episode_running_info') and hasattr(self.env, 'actions_description'): - for action in self.env.actions_description: - self.episode_running_info[action] = [] - plt.clf() - - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: - for network in self.networks: - network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init - network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init - - self.prepare_initial_state() - - def preprocess_observation(self, observation): - """ - Preprocesses the given observation. - For images - convert to grayscale, resize and convert to int. - For measurements vectors - normalize by a running average and std. 
- :param observation: The agents observation - :return: A processed version of the observation - """ - - if self.env.is_state_type_image: - # rescale - observation = scipy.misc.imresize(observation, - (self.tp.env.desired_observation_height, - self.tp.env.desired_observation_width), - interp=self.tp.rescaling_interpolation_type) - # rgb to y - if len(observation.shape) > 2 and observation.shape[2] > 1: - r, g, b = observation[:, :, 0], observation[:, :, 1], observation[:, :, 2] - observation = 0.2989 * r + 0.5870 * g + 0.1140 * b - - # Render the processed observation which is how the agent will see it - # Warning: this cannot currently be done in parallel to rendering the environment - if self.tp.visualization.render_observation: - if not self.renderer.is_open: - self.renderer.create_screen(observation.shape[0], observation.shape[1]) - self.renderer.render_image(observation) - - return observation.astype('uint8') - else: - if self.tp.env.normalize_observation and self.sess is not None: - # standardize the input observation using a running mean and std - if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers: - self.running_observation_stats.push(observation) - observation = (observation - self.running_observation_stats.mean) / \ - (self.running_observation_stats.std + 1e-15) - observation = np.clip(observation, -5.0, 5.0) - return observation - - def learn_from_batch(self, batch): - """ - Given a batch of transitions, calculates their target values and updates the network. - :param batch: A list of transitions - :return: The loss of the training - """ - pass - - def train(self): - """ - A single training iteration. Sample a batch, train on it and update target networks. - :return: The training loss. - """ - batch = self.memory.sample(self.tp.batch_size) - loss = self.learn_from_batch(batch) - - if self.tp.learning_rate_decay_rate != 0: - self.curr_learning_rate.add_sample(self.tp.sess.run(self.tp.learning_rate)) - else: - self.curr_learning_rate.add_sample(self.tp.learning_rate) - - # update the target network of every network that has a target network - if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0: - for network in self.networks: - network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target) - logger.create_signal_value('Update Target Network', 1) - else: - logger.create_signal_value('Update Target Network', 0, overwrite=False) - - return loss - - def extract_batch(self, batch): - """ - Extracts a single numpy array for each object in a batch of transitions (state, action, etc.) 
- :param batch: An array of transitions - :return: For each transition element, returns a numpy array of all the transitions in the batch - """ - current_states = {} - next_states = {} - current_states['observation'] = np.array([np.array(transition.state['observation']) for transition in batch]) - next_states['observation'] = np.array([np.array(transition.next_state['observation']) for transition in batch]) - actions = np.array([transition.action for transition in batch]) - rewards = np.array([transition.reward for transition in batch]) - game_overs = np.array([transition.game_over for transition in batch]) - total_return = np.array([transition.total_return for transition in batch]) - - # get the entire state including measurements if available - if self.tp.agent.use_measurements: - current_states['measurements'] = np.array([transition.state['measurements'] for transition in batch]) - next_states['measurements'] = np.array([transition.next_state['measurements'] for transition in batch]) - - return current_states, next_states, actions, rewards, game_overs, total_return - - def plot_action_values_online(self): - """ - Plot an animated graph of the value of each possible action during the episode - :return: None - """ - - plt.clf() - for key, data_list in self.episode_running_info.items(): - plt.plot(data_list, label=key) - plt.legend() - plt.pause(0.00000001) - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - """ - choose an action to act with in the current episode being played. Different behavior might be exhibited when training - or testing. - - :param curr_state: the current state to act upon. - :param phase: the current phase: training or testing. - :return: chosen action, some action value describing the action (q-value, probability, etc) - """ - pass - - def preprocess_reward(self, reward): - if self.tp.env.reward_scaling: - reward /= float(self.tp.env.reward_scaling) - if self.tp.env.reward_clipping_max: - reward = min(reward, self.tp.env.reward_clipping_max) - if self.tp.env.reward_clipping_min: - reward = max(reward, self.tp.env.reward_clipping_min) - return reward - - def tf_input_state(self, curr_state): - """ - convert curr_state into input tensors tensorflow is expecting. 
- """ - # add batch axis with length 1 onto each value - # extract values from the state based on agent.input_types - input_state = {} - for input_name in self.tp.agent.input_types.keys(): - input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0) - return input_state - - def prepare_initial_state(self): - """ - Create an initial state when starting a new episode - :return: None - """ - observation = self.preprocess_observation(self.env.state['observation']) - self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) - observation = LazyStack(self.curr_stack, -1) - - self.curr_state = { - 'observation': observation - } - if self.tp.agent.use_measurements: - if 'measurements' in self.env.state.keys(): - self.curr_state['measurements'] = self.env.state['measurements'] - else: - self.curr_state['measurements'] = np.zeros(0) - if self.tp.agent.use_accumulated_reward_as_measurement: - self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0) - - def act(self, phase=RunPhase.TRAIN): - """ - Take one step in the environment according to the network prediction and store the transition in memory - :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored - :return: A boolean value that signals an episode termination - """ - - if phase != RunPhase.TEST: - self.total_steps_counter += 1 - self.current_episode_steps_counter += 1 - - # get new action - action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0} - - if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: - action = self.env.get_random_action() - else: - action, action_info = self.choose_action(self.curr_state, phase=phase) - - # perform action - if type(action) == np.ndarray: - action = action.squeeze() - result = self.env.step(action) - - shaped_reward = self.preprocess_reward(result['reward']) - if 'action_intrinsic_reward' in action_info.keys(): - shaped_reward += action_info['action_intrinsic_reward'] - # TODO: should total_reward_in_current_episode include shaped_reward? 
- self.total_reward_in_current_episode += result['reward'] - next_state = copy.copy(result['state']) - next_state['observation'] = self.preprocess_observation(next_state['observation']) - - # plot action values online - if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP: - self.plot_action_values_online() - - # initialize the next state - # TODO: provide option to stack more than just the observation - self.curr_stack.append(next_state['observation']) - observation = LazyStack(self.curr_stack, -1) - - next_state['observation'] = observation - if self.tp.agent.use_measurements: - if 'measurements' in result['state'].keys(): - next_state['measurements'] = result['state']['measurements'] - else: - next_state['measurements'] = np.zeros(0) - if self.tp.agent.use_accumulated_reward_as_measurement: - next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode) - - # store the transition only if we are training - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: - transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done']) - for key in action_info.keys(): - transition.info[key] = action_info[key] - if self.tp.agent.add_a_normalized_timestep_to_the_observation: - transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit - self.memory.store(transition) - elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs: - # we store the transitions only for saving gifs - self.last_episode_images.append(self.env.get_rendered_image()) - - # update the current state for the next step - self.curr_state = next_state - - # deal with episode termination - if result['done']: - if self.tp.visualization.dump_csv: - self.update_log(phase=phase) - self.log_to_screen(phase=phase) - - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: - self.reset_game() - - self.current_episode += 1 - self.tp.current_episode = self.current_episode - - # return episode really ended - return result['done'] - - def evaluate(self, num_episodes, keep_networks_synced=False): - """ - Run in an evaluation mode for several episodes. Actions will be chosen greedily. 
- :param keep_networks_synced: keep the online network in sync with the global network after every episode - :param num_episodes: The number of episodes to evaluate on - :return: None - """ - - max_reward_achieved = -float('inf') - average_evaluation_reward = 0 - screen.log_title("Running evaluation") - self.env.change_phase(RunPhase.TEST) - for i in range(num_episodes): - # keep the online network in sync with the global network - if keep_networks_synced: - for network in self.networks: - network.sync() - - episode_ended = False - while not episode_ended: - episode_ended = self.act(phase=RunPhase.TEST) - - if keep_networks_synced \ - and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps: - for network in self.networks: - network.sync() - - if self.total_reward_in_current_episode > max_reward_achieved: - max_reward_achieved = self.total_reward_in_current_episode - frame_skipping = int(5/self.tp.env.frame_skip) - if self.tp.visualization.dump_gifs: - logger.create_gif(self.last_episode_images[::frame_skipping], - name='score-{}'.format(max_reward_achieved), fps=10) - - average_evaluation_reward += self.total_reward_in_current_episode - self.reset_game() - - average_evaluation_reward /= float(num_episodes) - - self.env.change_phase(RunPhase.TRAIN) - screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward)) - - def post_training_commands(self): - pass - - def improve(self): - """ - Training algorithms wrapper. Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ] - - :return: None - """ - - # synchronize the online network weights with the global network - for network in self.networks: - network.sync() - - # heatup phase - if self.tp.num_heatup_steps != 0: - self.in_heatup = True - screen.log_title("Starting heatup {}".format(self.task_id)) - num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size - for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)): - self.act(phase=RunPhase.HEATUP) - - # training phase - self.in_heatup = False - screen.log_title("Starting training {}".format(self.task_id)) - self.exploration_policy.change_phase(RunPhase.TRAIN) - training_start_time = time.time() - model_snapshots_periods_passed = -1 - self.reset_game() - - while self.training_iteration < self.tp.num_training_iterations: - # evaluate - evaluate_agent = (self.last_episode_evaluation_ran is not self.current_episode) and \ - (self.current_episode % self.tp.evaluate_every_x_episodes == 0) - evaluate_agent = evaluate_agent or \ - (self.imitation and self.training_iteration > 0 and - self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0) - - if evaluate_agent: - self.env.reset(force_environment_reset=True) - self.last_episode_evaluation_ran = self.current_episode - self.evaluate(self.tp.evaluation_episodes) - - # snapshot model - if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed: - total_training_time = time.time() - training_start_time - current_snapshot_period = (int(total_training_time) // self.tp.save_model_sec) - if current_snapshot_period > model_snapshots_periods_passed: - model_snapshots_periods_passed = current_snapshot_period - self.save_model(model_snapshots_periods_passed) - if hasattr(self, 'running_observation_state') and self.running_observation_stats is not None: - to_pickle(self.running_observation_stats, - os.path.join(self.tp.save_model_dir, - 
"running_stats.p".format(model_snapshots_periods_passed))) - - # play and record in replay buffer - if self.tp.agent.collect_new_data: - if self.tp.agent.step_until_collecting_full_episodes: - step = 0 - while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0: - self.act() - step += 1 - else: - for step in range(self.tp.agent.num_consecutive_playing_steps): - self.act() - - # train - if self.tp.train: - for step in range(self.tp.agent.num_consecutive_training_steps): - loss = self.train() - self.loss.add_sample(loss) - self.training_iteration += 1 - if self.imitation: - self.log_to_screen(RunPhase.TRAIN) - self.post_training_commands() - - def save_model(self, model_id): - self.main_network.save_model(model_id) diff --git a/agents/bc_agent.py b/agents/bc_agent.py deleted file mode 100644 index 70fe3e6..0000000 --- a/agents/bc_agent.py +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np - -from agents.imitation_agent import ImitationAgent - - -# Behavioral Cloning Agent -class BCAgent(ImitationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - - def learn_from_batch(self, batch): - current_states, _, actions, _, _, _ = self.extract_batch(batch) - - # the targets for the network are the actions since this is supervised learning - if self.env.discrete_controls: - targets = np.eye(self.env.action_space_size)[[actions]] - else: - targets = actions - - result = self.main_network.train_and_sync_networks(current_states, targets) - total_loss = result[0] - - return total_loss diff --git a/agents/bootstrapped_dqn_agent.py b/agents/bootstrapped_dqn_agent.py deleted file mode 100644 index 3476022..0000000 --- a/agents/bootstrapped_dqn_agent.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from agents.value_optimization_agent import * - - -# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf -class BootstrappedDQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - - def reset_game(self, do_not_reset_env=False): - ValueOptimizationAgent.reset_game(self, do_not_reset_env) - self.exploration_policy.select_head() - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # for the action we actually took, the error is: - # TD error = r + discount*max(q_st_plus_1) - q_st - # for all other actions, the error is 0 - q_st_plus_1 = self.main_network.target_network.predict(next_states) - # initialize with the current prediction so that we will - TD_targets = self.main_network.online_network.predict(current_states) - - # only update the action that we have actually done in this transition - for i in range(self.tp.batch_size): - mask = batch[i].info['mask'] - for head_idx in range(self.tp.exploration.architecture_num_q_heads): - if mask[head_idx] == 1: - TD_targets[head_idx][i, actions[i]] = rewards[i] + \ - (1.0 - game_overs[i]) * self.tp.agent.discount * np.max( - q_st_plus_1[head_idx][i], 0) - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - - total_loss = result[0] - - return total_loss - - def act(self, phase=RunPhase.TRAIN): - ValueOptimizationAgent.act(self, phase) - mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability, - self.tp.exploration.architecture_num_q_heads) - self.memory.update_last_transition_info({'mask': mask}) diff --git a/agents/categorical_dqn_agent.py b/agents/categorical_dqn_agent.py deleted file mode 100644 index dec8ba2..0000000 --- a/agents/categorical_dqn_agent.py +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from agents.value_optimization_agent import * - - -# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf -class CategoricalDQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms) - - # prediction's format is (batch,actions,atoms) - def get_q_values(self, prediction): - return np.dot(prediction, self.z_values) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # for the action we actually took, the error is calculated by the atoms distribution - # for all other actions, the error is 0 - distributed_q_st_plus_1 = self.main_network.target_network.predict(next_states) - # initialize with the current prediction so that we will - TD_targets = self.main_network.online_network.predict(current_states) - - # only update the action that we have actually done in this transition - target_actions = np.argmax(self.get_q_values(distributed_q_st_plus_1), axis=1) - m = np.zeros((self.tp.batch_size, self.z_values.size)) - - batches = np.arange(self.tp.batch_size) - for j in range(self.z_values.size): - tzj = np.fmax(np.fmin(rewards + (1.0 - game_overs) * self.tp.agent.discount * self.z_values[j], - self.z_values[self.z_values.size - 1]), - self.z_values[0]) - bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0]) - u = (np.ceil(bj)).astype(int) - l = (np.floor(bj)).astype(int) - m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj)) - m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l)) - # total_loss = cross entropy between actual result above and predicted result for the given action - TD_targets[batches, actions] = m - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss - diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py deleted file mode 100644 index ad066ae..0000000 --- a/agents/clipped_ppo_agent.py +++ /dev/null @@ -1,212 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -from agents.actor_critic_agent import * -from random import shuffle - - -# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 -class ClippedPPOAgent(ActorCriticAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) - # signals definition - self.value_loss = Signal('Value Loss') - self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') - self.signals.append(self.policy_loss) - self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') - self.signals.append(self.unclipped_grads) - self.value_targets = Signal('Value Targets') - self.signals.append(self.value_targets) - self.kl_divergence = Signal('KL Divergence') - self.signals.append(self.kl_divergence) - - def fill_advantages(self, batch): - current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - - current_state_values = self.main_network.online_network.predict(current_states)[0] - current_state_values = current_state_values.squeeze() - self.state_values.add_sample(current_state_values) - - # calculate advantages - advantages = [] - value_targets = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: - advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: - # get bootstraps - episode_start_idx = 0 - advantages = np.array([]) - value_targets = np.array([]) - for idx, game_over in enumerate(game_overs): - if game_over: - # get advantages for the rollout - value_bootstrapping = np.zeros((1,)) - rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping) - - rollout_advantages, gae_based_value_targets = \ - self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx+1], - rollout_state_values) - episode_start_idx = idx + 1 - advantages = np.append(advantages, rollout_advantages) - value_targets = np.append(value_targets, gae_based_value_targets) - else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") - - # standardize - advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8) - - for transition, advantage, value_target in zip(batch, advantages, value_targets): - transition.info['advantage'] = advantage - transition.info['gae_based_value_target'] = value_target - - self.action_advantages.add_sample(advantages) - - def train_network(self, dataset, epochs): - loss = [] - for j in range(epochs): - loss = { - 'total_loss': [], - 'policy_losses': [], - 'unclipped_grads': [], - 'fetch_result': [] - } - shuffle(dataset) - for i in range(int(len(dataset) / self.tp.batch_size)): - batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size] - current_states, _, actions, _, _, total_return = self.extract_batch(batch) - - advantages = np.array([t.info['advantage'] for t in batch]) - gae_based_value_targets = np.array([t.info['gae_based_value_target'] for t in batch]) - if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1: - actions = np.expand_dims(actions, -1) - - # get old policy probabilities and distribution - result = self.main_network.target_network.predict(current_states) - old_policy_distribution = result[1:] - - # calculate gradients and apply on both the local policy network and on the global policy network - fetches = 
[self.main_network.online_network.output_heads[1].kl_divergence, - self.main_network.online_network.output_heads[1].entropy] - - total_return = np.expand_dims(total_return, -1) - value_targets = gae_based_value_targets if self.tp.agent.estimate_value_using_gae else total_return - inputs = copy.copy(current_states) - # TODO: why is this output 0 and not output 1? - inputs['output_0_0'] = actions - # TODO: does old_policy_distribution really need to be represented as a list? - # A: yes it does, in the event of discrete controls, it has just a mean - # otherwise, it has both a mean and standard deviation - for input_index, input in enumerate(old_policy_distribution): - inputs['output_0_{}'.format(input_index + 1)] = input - total_loss, policy_losses, unclipped_grads, fetch_result =\ - self.main_network.online_network.accumulate_gradients( - inputs, [total_return, advantages], additional_fetches=fetches) - - self.value_targets.add_sample(value_targets) - if self.tp.distributed: - self.main_network.apply_gradients_to_global_network() - self.main_network.update_online_network() - else: - self.main_network.apply_gradients_to_online_network() - - self.main_network.online_network.reset_accumulated_gradients() - - loss['total_loss'].append(total_loss) - loss['policy_losses'].append(policy_losses) - loss['unclipped_grads'].append(unclipped_grads) - loss['fetch_result'].append(fetch_result) - - self.unclipped_grads.add_sample(unclipped_grads) - - for key in loss.keys(): - loss[key] = np.mean(loss[key], 0) - - if self.tp.learning_rate_decay_rate != 0: - curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate) - self.curr_learning_rate.add_sample(curr_learning_rate) - else: - curr_learning_rate = self.tp.learning_rate - - # log training parameters - screen.log_dict( - OrderedDict([ - ("Surrogate loss", loss['policy_losses'][0]), - ("KL divergence", loss['fetch_result'][0]), - ("Entropy", loss['fetch_result'][1]), - ("training epoch", j), - ("learning_rate", curr_learning_rate) - ]), - prefix="Policy training" - ) - - self.total_kl_divergence_during_training_process = loss['fetch_result'][0] - self.entropy.add_sample(loss['fetch_result'][1]) - self.kl_divergence.add_sample(loss['fetch_result'][0]) - return policy_losses - - def post_training_commands(self): - - # clean memory - self.memory.clean() - - def train(self): - self.main_network.sync() - - dataset = self.memory.transitions - - self.fill_advantages(dataset) - - # take only the requested number of steps - dataset = dataset[:self.tp.agent.num_consecutive_playing_steps] - - if self.tp.distributed and self.tp.agent.share_statistics_between_workers: - self.running_observation_stats.push(np.array([np.array(t.state['observation']) for t in dataset])) - - losses = self.train_network(dataset, 10) - self.value_loss.add_sample(losses[0]) - self.policy_loss.add_sample(losses[1]) - self.update_log() # should be done in order to update the data that has been accumulated * while not playing * - return np.append(losses[0], losses[1]) - - def choose_action(self, current_state, phase=RunPhase.TRAIN): - if self.env.discrete_controls: - # DISCRETE - _, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state)) - action_values = action_values.squeeze() - - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = np.argmax(action_values) - action_info = {"action_probability": action_values[action]} - # 
self.entropy.add_sample(-np.sum(action_values * np.log(action_values))) - else: - # CONTINUOUS - _, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state)) - action_values_mean = action_values_mean.squeeze() - action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: - action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) - # if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5: - # print action - else: - action = action_values_mean - action_info = {"action_probability": action_values_mean} - - return action, action_info diff --git a/agents/ddpg_agent.py b/agents/ddpg_agent.py deleted file mode 100644 index 425f1de..0000000 --- a/agents/ddpg_agent.py +++ /dev/null @@ -1,109 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.actor_critic_agent import * -from configurations import * - - -# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf -class DDPGAgent(ActorCriticAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) - # define critic network - self.critic_network = self.main_network - # self.networks.append(self.critic_network) - - # define actor network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.Pi] - self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor', - self.replicated_device, self.worker_device) - self.networks.append(self.actor_network) - - self.q_values = Signal("Q") - self.signals.append(self.q_values) - - self.reset_game(do_not_reset_env=True) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # TD error = r + discount*max(q_st_plus_1) - q_st - next_actions = self.actor_network.target_network.predict(next_states) - inputs = copy.copy(next_states) - inputs['action'] = next_actions - q_st_plus_1 = self.critic_network.target_network.predict(inputs) - TD_targets = np.expand_dims(rewards, -1) + \ - (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * q_st_plus_1 - - # get the gradients of the critic output with respect to the action - actions_mean = self.actor_network.online_network.predict(current_states) - critic_online_network = self.critic_network.online_network - # TODO: convert into call to predict, current method ignores lstm middleware for example - action_gradients = self.critic_network.sess.run(critic_online_network.gradients_wrt_inputs['action'], - feed_dict=critic_online_network._feed_dict({ - **current_states, - 'action': actions_mean, - }))[0] - - # train the critic - if len(actions.shape) == 1: - actions = np.expand_dims(actions, -1) - result = 
self.critic_network.train_and_sync_networks({**current_states, 'action': actions}, TD_targets) - total_loss = result[0] - - # apply the gradients from the critic to the actor - actor_online_network = self.actor_network.online_network - gradients = self.actor_network.sess.run(actor_online_network.weighted_gradients, - feed_dict=actor_online_network._feed_dict({ - **current_states, - actor_online_network.gradients_weights_ph: -action_gradients, - })) - if self.actor_network.has_global: - self.actor_network.global_network.apply_gradients(gradients) - self.actor_network.update_online_network() - else: - self.actor_network.online_network.apply_gradients(gradients) - - return total_loss - - def train(self): - return Agent.train(self) - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - assert not self.env.discrete_controls, 'DDPG works only for continuous control problems' - result = self.actor_network.online_network.predict(self.tf_input_state(curr_state)) - action_values = result[0].squeeze() - - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = action_values - - action = np.clip(action, self.env.action_space_low, self.env.action_space_high) - - # get q value - action_batch = np.expand_dims(action, 0) - if type(action) != np.ndarray: - action_batch = np.array([[action]]) - inputs = self.tf_input_state(curr_state) - inputs['action'] = action_batch - q_value = self.critic_network.online_network.predict(inputs)[0] - self.q_values.add_sample(q_value) - action_info = {"action_value": q_value} - - return action, action_info diff --git a/agents/ddqn_agent.py b/agents/ddqn_agent.py deleted file mode 100644 index 838ae3f..0000000 --- a/agents/ddqn_agent.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
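The DDPG `learn_from_batch` removed above splits the update in two: a standard one-step TD regression for the critic, and an actor step that follows the critic's gradient with respect to the action. The critic target reduces to the following sketch (array names are illustrative; `q_next_target` stands for Q_target(s', mu_target(s'))):

```python
import numpy as np

def ddpg_critic_targets(rewards, dones, q_next_target, gamma=0.99):
    """One-step DDPG critic targets: y = r + (1 - done) * gamma * Q_target(s', mu_target(s'))."""
    return rewards[:, None] + (1.0 - dones[:, None]) * gamma * q_next_target
```

For the actor, the deleted code evaluates dQ/da at a = mu(s) via `gradients_wrt_inputs['action']` and feeds the negated result in as gradient weights, which amounts to the chain rule dQ/dtheta = dQ/da * dmu/dtheta of the deterministic policy gradient.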
-# - -from agents.value_optimization_agent import * - - -# Double DQN - https://arxiv.org/abs/1509.06461 -class DDQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1) - q_st_plus_1 = self.main_network.target_network.predict(next_states) - TD_targets = self.main_network.online_network.predict(current_states) - - # initialize with the current prediction so that we will - # only update the action that we have actually done in this transition - for i in range(self.tp.batch_size): - TD_targets[i, actions[i]] = rewards[i] \ - + (1.0 - game_overs[i]) * self.tp.agent.discount * q_st_plus_1[i][ - selected_actions[i]] - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss diff --git a/agents/dfp_agent.py b/agents/dfp_agent.py deleted file mode 100644 index 8f98b94..0000000 --- a/agents/dfp_agent.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
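The DDQNAgent removed above differs from vanilla DQN only in how the bootstrap action is picked: the online network selects it, the target network evaluates it. A vectorized sketch of that target (array names are illustrative, not the Coach API):

```python
import numpy as np

def double_dqn_targets(q_online_next, q_target_next, q_online_current,
                       actions, rewards, dones, gamma=0.99):
    """Double DQN targets: select with the online net, evaluate with the target net.
    q_* arrays have shape (batch, num_actions); actions/rewards/dones are (batch,)."""
    targets = q_online_current.copy()            # start from the current prediction...
    batch_idx = np.arange(len(actions))
    selected = np.argmax(q_online_next, axis=1)  # action selection: online network
    bootstrap = q_target_next[batch_idx, selected]  # action evaluation: target network
    # ...so that only the action actually taken in each transition is updated
    targets[batch_idx, actions] = rewards + (1.0 - dones) * gamma * bootstrap
    return targets
```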
-# - -from agents.agent import * - - -# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf -class DFPAgent(Agent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.current_goal = self.tp.agent.goal_vector - self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main', - self.replicated_device, self.worker_device) - self.networks.append(self.main_network) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch) - - # create the inputs for the network - input = current_states - input['goal'] = np.repeat(np.expand_dims(self.current_goal, 0), self.tp.batch_size, 0) - - # get the current outputs of the network - targets = self.main_network.online_network.predict(input) - - # change the targets for the taken actions - for i in range(self.tp.batch_size): - targets[i, actions[i]] = batch[i].info['future_measurements'].flatten() - - result = self.main_network.train_and_sync_networks(input, targets) - total_loss = result[0] - - return total_loss - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - # convert to batch so we can run it through the network - observation = np.expand_dims(np.array(curr_state['observation']), 0) - measurements = np.expand_dims(np.array(curr_state['measurements']), 0) - goal = np.expand_dims(self.current_goal, 0) - - # predict the future measurements - measurements_future_prediction = self.main_network.online_network.predict({ - "observation": observation, - "measurements": measurements, - "goal": goal})[0] - action_values = np.zeros((self.action_space_size,)) - num_steps_used_for_objective = len(self.tp.agent.future_measurements_weights) - - # calculate the score of each action by multiplying it's future measurements with the goal vector - for action_idx in range(self.action_space_size): - action_measurements = measurements_future_prediction[action_idx] - action_measurements = np.reshape(action_measurements, - (self.tp.agent.num_predicted_steps_ahead, self.measurements_size[0])) - future_steps_values = np.dot(action_measurements, self.current_goal) - action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:], - self.tp.agent.future_measurements_weights) - - # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = np.argmax(action_values) - - action_values = action_values.squeeze() - - # store information for plotting interactively (actual plotting is done in agent) - if self.tp.visualization.plot_action_values_online: - for idx, action_name in enumerate(self.env.actions_description): - self.episode_running_info[action_name].append(action_values[idx]) - - action_info = {"action_probability": 0, "action_value": action_values[action]} - - return action, action_info diff --git a/agents/distributional_dqn_agent.py b/agents/distributional_dqn_agent.py deleted file mode 100644 index d7c0088..0000000 --- a/agents/distributional_dqn_agent.py +++ /dev/null @@ -1,60 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.value_optimization_agent import * - - -# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf -class DistributionalDQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms) - - # prediction's format is (batch,actions,atoms) - def get_q_values(self, prediction): - return np.dot(prediction, self.z_values) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # for the action we actually took, the error is calculated by the atoms distribution - # for all other actions, the error is 0 - distributed_q_st_plus_1 = self.main_network.target_network.predict(next_states) - # initialize with the current prediction so that we will - TD_targets = self.main_network.online_network.predict(current_states) - - # only update the action that we have actually done in this transition - target_actions = np.argmax(self.get_q_values(distributed_q_st_plus_1), axis=1) - m = np.zeros((self.tp.batch_size, self.z_values.size)) - - batches = np.arange(self.tp.batch_size) - for j in range(self.z_values.size): - tzj = np.fmax(np.fmin(rewards + (1.0 - game_overs) * self.tp.agent.discount * self.z_values[j], - self.z_values[self.z_values.size - 1]), - self.z_values[0]) - bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0]) - u = (np.ceil(bj)).astype(int) - l = (np.floor(bj)).astype(int) - m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj)) - m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l)) - # total_loss = cross entropy between actual result above and predicted result for the given action - TD_targets[batches, actions] = m - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss - diff --git a/agents/dqn_agent.py b/agents/dqn_agent.py deleted file mode 100644 index 70c0c7d..0000000 --- a/agents/dqn_agent.py +++ /dev/null @@ -1,43 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
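Stepping back to the DFPAgent removed a few hunks above: it learns no value function at all; instead it predicts future measurements per action and scores each action by how well those predictions align with the current goal vector. A rough sketch of that scoring (shapes and names are illustrative, not the Coach API):

```python
import numpy as np

def dfp_action_scores(predicted_future, goal, step_weights):
    """predicted_future: (num_actions, num_steps * meas_dim) network output
    goal:               (meas_dim,) weighting over the measurements
    step_weights:       weights over the last len(step_weights) predicted steps"""
    meas_dim = goal.shape[0]
    k = len(step_weights)
    scores = np.zeros(predicted_future.shape[0])
    for a in range(predicted_future.shape[0]):
        per_step = predicted_future[a].reshape(-1, meas_dim) @ goal  # value of each future step
        scores[a] = per_step[-k:] @ step_weights                     # weight the last k steps
    return scores
```

The exploration policy then chooses among these scores exactly as a Q-based agent would choose among Q-values.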
-# - -from agents.value_optimization_agent import * - - -# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf -class DQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # for the action we actually took, the error is: - # TD error = r + discount*max(q_st_plus_1) - q_st - # for all other actions, the error is 0 - q_st_plus_1 = self.main_network.target_network.predict(next_states) - # initialize with the current prediction so that we will - TD_targets = self.main_network.online_network.predict(current_states) - - # only update the action that we have actually done in this transition - for i in range(self.tp.batch_size): - TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max( - q_st_plus_1[i], 0) - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss diff --git a/agents/human_agent.py b/agents/human_agent.py deleted file mode 100644 index c75c2a2..0000000 --- a/agents/human_agent.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.agent import * -import pygame - - -class HumanAgent(Agent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - - self.clock = pygame.time.Clock() - self.max_fps = int(self.tp.visualization.max_fps_for_human_control) - - screen.log_title("Human Control Mode") - available_keys = self.env.get_available_keys() - if available_keys: - screen.log("Use keyboard keys to move. Press escape to quit. 
Available keys:") - screen.log("") - for action, key in self.env.get_available_keys(): - screen.log("\t- {}: {}".format(action, key)) - screen.separator() - - def train(self): - return 0 - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - action = self.env.get_action_from_user() - - # keep constant fps - self.clock.tick(self.max_fps) - - if not self.env.renderer.is_open: - self.save_replay_buffer_and_exit() - - return action, {"action_value": 0} - - def save_replay_buffer_and_exit(self): - replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p') - self.memory.tp = None - to_pickle(self.memory, replay_buffer_path) - screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path)) - exit() - - def log_to_screen(self, phase): - # log to screen - screen.log_dict( - OrderedDict([ - ("Episode", self.current_episode), - ("total reward", self.total_reward_in_current_episode), - ("steps", self.total_steps_counter) - ]), - prefix="Recording" - ) diff --git a/agents/imitation_agent.py b/agents/imitation_agent.py deleted file mode 100644 index f893fbe..0000000 --- a/agents/imitation_agent.py +++ /dev/null @@ -1,65 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.agent import * - - -# Imitation Agent -class ImitationAgent(Agent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main', - self.replicated_device, self.worker_device) - self.networks.append(self.main_network) - self.imitation = True - - def extract_action_values(self, prediction): - return prediction.squeeze() - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - # convert to batch so we can run it through the network - prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state)) - - # get action values and extract the best action from it - action_values = self.extract_action_values(prediction) - if self.env.discrete_controls: - # DISCRETE - # action = np.argmax(action_values) - action = self.evaluation_exploration_policy.get_action(action_values) - action_value = {"action_probability": action_values[action]} - else: - # CONTINUOUS - action = action_values - action_value = {} - - return action, action_value - - def log_to_screen(self, phase): - # log to screen - if phase == RunPhase.TRAIN: - # for the training phase - we log during the episode to visualize the progress in training - screen.log_dict( - OrderedDict([ - ("Worker", self.task_id), - ("Episode", self.current_episode), - ("Loss", self.loss.values[-1]), - ("Training iteration", self.training_iteration) - ]), - prefix="Training" - ) - else: - # for the evaluation phase - logging as in regular RL - Agent.log_to_screen(self, phase) diff --git a/agents/mmc_agent.py b/agents/mmc_agent.py deleted file mode 100644 index 2b5a2cb..0000000 --- 
a/agents/mmc_agent.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.value_optimization_agent import * - - -class MixedMonteCarloAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - - TD_targets = self.main_network.online_network.predict(current_states) - selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1) - q_st_plus_1 = self.main_network.target_network.predict(next_states) - # initialize with the current prediction so that we will - # only update the action that we have actually done in this transition - for i in range(self.tp.batch_size): - one_step_target = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * q_st_plus_1[i][ - selected_actions[i]] - monte_carlo_target = total_return[i] - TD_targets[i, actions[i]] = (1 - self.mixing_rate) * one_step_target + self.mixing_rate * monte_carlo_target - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss diff --git a/agents/n_step_q_agent.py b/agents/n_step_q_agent.py deleted file mode 100644 index 5a74fb5..0000000 --- a/agents/n_step_q_agent.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
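The MixedMonteCarloAgent removed above simply blends two targets: a double-DQN-style one-step bootstrap and the full Monte Carlo return observed for the transition. As a sketch (names are illustrative, not the Coach API):

```python
import numpy as np

def mmc_targets(q_online_current, q_online_next, q_target_next, actions,
                rewards, dones, total_returns, gamma, mixing_rate):
    """Mixed Monte Carlo targets: (1 - mix) * one-step target + mix * episode return."""
    targets = q_online_current.copy()
    batch_idx = np.arange(len(actions))
    selected = np.argmax(q_online_next, axis=1)   # online net selects the bootstrap action
    one_step = rewards + (1.0 - dones) * gamma * q_target_next[batch_idx, selected]
    targets[batch_idx, actions] = (1.0 - mixing_rate) * one_step + mixing_rate * total_returns
    return targets
```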
-# -import numpy as np -import scipy.signal - -from agents.value_optimization_agent import ValueOptimizationAgent -from agents.policy_optimization_agent import PolicyOptimizationAgent -from logger import logger -from utils import Signal, last_sample - - -# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783 -class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True) - self.last_gradient_update_step_idx = 0 - self.q_values = Signal('Q Values') - self.unclipped_grads = Signal('Grads (unclipped)') - self.value_loss = Signal('Value Loss') - self.signals.append(self.q_values) - self.signals.append(self.unclipped_grads) - self.signals.append(self.value_loss) - - def learn_from_batch(self, batch): - # batch contains a list of episodes to learn from - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # get the values for the current states - state_value_head_targets = self.main_network.online_network.predict(current_states) - - # the targets for the state value estimator - num_transitions = len(game_overs) - - if self.tp.agent.targets_horizon == '1-Step': - # 1-Step Q learning - q_st_plus_1 = self.main_network.target_network.predict(next_states) - - for i in reversed(range(num_transitions)): - state_value_head_targets[i][actions[i]] = \ - rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0) - - elif self.tp.agent.targets_horizon == 'N-Step': - # N-Step Q learning - if game_overs[-1]: - R = 0 - else: - R = np.max(self.main_network.target_network.predict(last_sample(next_states))) - - for i in reversed(range(num_transitions)): - R = rewards[i] + self.tp.agent.discount * R - state_value_head_targets[i][actions[i]] = R - - else: - assert True, 'The available values for targets_horizon are: 1-Step, N-Step' - - # train - result = self.main_network.online_network.accumulate_gradients(current_states, [state_value_head_targets]) - - # logging - total_loss, losses, unclipped_grads = result[:3] - self.unclipped_grads.add_sample(unclipped_grads) - self.value_loss.add_sample(losses[0]) - - return total_loss - - def train(self): - # update the target network of every network that has a target network - if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0: - for network in self.networks: - network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target) - logger.create_signal_value('Update Target Network', 1) - else: - logger.create_signal_value('Update Target Network', 0, overwrite=False) - - return PolicyOptimizationAgent.train(self) diff --git a/agents/naf_agent.py b/agents/naf_agent.py deleted file mode 100644 index 65ca83c..0000000 --- a/agents/naf_agent.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np - -from agents.value_optimization_agent import ValueOptimizationAgent -from utils import RunPhase, Signal - - -# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf -class NAFAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.l_values = Signal("L") - self.a_values = Signal("Advantage") - self.mu_values = Signal("Action") - self.v_values = Signal("V") - self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values] - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # TD error = r + discount*v_st_plus_1 - q_st - v_st_plus_1 = self.main_network.target_network.predict( - next_states, - self.main_network.target_network.output_heads[0].V, - squeeze_output=False, - ) - TD_targets = np.expand_dims(rewards, -1) + (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * v_st_plus_1 - - if len(actions.shape) == 1: - actions = np.expand_dims(actions, -1) - - result = self.main_network.train_and_sync_networks({**current_states, 'output_0_0': actions}, TD_targets) - total_loss = result[0] - - return total_loss - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - assert not self.env.discrete_controls, 'NAF works only for continuous control problems' - - # convert to batch so we can run it through the network - # observation = np.expand_dims(np.array(curr_state['observation']), 0) - naf_head = self.main_network.online_network.output_heads[0] - action_values = self.main_network.online_network.predict( - self.tf_input_state(curr_state), - outputs=naf_head.mu, - squeeze_output=False, - ) - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = action_values - - Q, L, A, mu, V = self.main_network.online_network.predict( - {**self.tf_input_state(curr_state), 'output_0_0': action_values}, - outputs=[naf_head.Q, naf_head.L, naf_head.A, naf_head.mu, naf_head.V], - ) - - # store the q values statistics for logging - self.q_values.add_sample(Q) - self.l_values.add_sample(L) - self.a_values.add_sample(A) - self.mu_values.add_sample(mu) - self.v_values.add_sample(V) - - action_value = {"action_value": Q} - return action, action_value diff --git a/agents/nec_agent.py b/agents/nec_agent.py deleted file mode 100644 index a327db4..0000000 --- a/agents/nec_agent.py +++ /dev/null @@ -1,96 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
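The NAFAgent removed above only queries the network heads (Q, L, A, mu, V), so the algebra tying them together is easy to miss. Per the NAF paper, the advantage is a quadratic built from the lower-triangular head L; the following single-sample sketch shows the decomposition those heads correspond to (names are illustrative):

```python
import numpy as np

def naf_q_value(v, mu, L, action):
    """NAF decomposition: Q(s, a) = V(s) - 0.5 * (a - mu)^T (L L^T) (a - mu)."""
    P = L @ L.T                      # positive semi-definite curvature of the advantage
    diff = action - mu
    advantage = -0.5 * diff @ P @ diff
    return v + advantage
```

Because the advantage is maximized at a = mu, acting greedily just means taking the mu head's output, which is what `choose_action` above does.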
-# - -import numpy as np -import os, pickle -from agents.value_optimization_agent import ValueOptimizationAgent -from logger import screen -from utils import RunPhase - - -# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf -class NECAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=False) - self.current_episode_state_embeddings = [] - self.training_started = False - - def learn_from_batch(self, batch): - if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn): - return 0 - else: - if not self.training_started: - self.training_started = True - screen.log_title("Finished collecting initial entries in DND. Starting to train network...") - - current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - - TD_targets = self.main_network.online_network.predict(current_states) - - # only update the action that we have actually done in this transition - for i in range(self.tp.batch_size): - TD_targets[i, actions[i]] = total_return[i] - - # train the neural network - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - - total_loss = result[0] - - return total_loss - - def act(self, phase=RunPhase.TRAIN): - if self.in_heatup: - # get embedding in heatup (otherwise we get it through choose_action) - embedding = self.main_network.online_network.predict( - self.tf_input_state(self.curr_state), - outputs=self.main_network.online_network.state_embedding) - self.current_episode_state_embeddings.append(embedding) - - return super().act(phase) - - def get_prediction(self, curr_state): - # get the actions q values and the state embedding - embedding, actions_q_values = self.main_network.online_network.predict( - self.tf_input_state(curr_state), - outputs=[self.main_network.online_network.state_embedding, - self.main_network.online_network.output_heads[0].output] - ) - - # store the state embedding for inserting it to the DND later - self.current_episode_state_embeddings.append(embedding.squeeze()) - actions_q_values = actions_q_values[0][0] - return actions_q_values - - def reset_game(self, do_not_reset_env=False): - super().reset_game(do_not_reset_env) - - # get the last full episode that we have collected - episode = self.memory.get_last_complete_episode() - if episode is not None: - # the indexing is only necessary because the heatup can end in the middle of an episode - # this won't be required after fixing this so that when the heatup is ended, the episode is closed - returns = episode.get_transitions_attribute('total_return')[:len(self.current_episode_state_embeddings)] - actions = episode.get_transitions_attribute('action')[:len(self.current_episode_state_embeddings)] - self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings, - actions, returns) - - self.current_episode_state_embeddings = [] - - def save_model(self, model_id): - self.main_network.save_model(model_id) - with open(os.path.join(self.tp.save_model_dir, str(model_id) + '.dnd'), 'wb') as f: - pickle.dump(self.main_network.online_network.output_heads[0].DND, f, pickle.HIGHEST_PROTOCOL) diff --git a/agents/pal_agent.py b/agents/pal_agent.py deleted file mode 100644 index 68ff675..0000000 --- a/agents/pal_agent.py +++ /dev/null @@ -1,65 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# 
Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.value_optimization_agent import * - - -# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf -class PALAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.alpha = tuning_parameters.agent.pal_alpha - self.persistent = tuning_parameters.agent.persistent_advantage_learning - self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - - selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1) - - # next state values - q_st_plus_1_target = self.main_network.target_network.predict(next_states) - v_st_plus_1_target = np.max(q_st_plus_1_target, 1) - - # current state values according to online network - q_st_online = self.main_network.online_network.predict(current_states) - - # current state values according to target network - q_st_target = self.main_network.target_network.predict(current_states) - v_st_target = np.max(q_st_target, 1) - - # calculate TD error - TD_targets = np.copy(q_st_online) - for i in range(self.tp.batch_size): - TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * \ - q_st_plus_1_target[i][selected_actions[i]] - advantage_learning_update = v_st_target[i] - q_st_target[i, actions[i]] - next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]] - # Persistent Advantage Learning or Regular Advantage Learning - if self.persistent: - TD_targets[i, actions[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update) - else: - TD_targets[i, actions[i]] -= self.alpha * advantage_learning_update - - # mixing monte carlo updates - monte_carlo_target = total_return[i] - TD_targets[i, actions[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, actions[i]] \ - + self.monte_carlo_mixing_rate * monte_carlo_target - - result = self.main_network.train_and_sync_networks(current_states, TD_targets) - total_loss = result[0] - - return total_loss diff --git a/agents/policy_gradients_agent.py b/agents/policy_gradients_agent.py deleted file mode 100644 index 3a592d1..0000000 --- a/agents/policy_gradients_agent.py +++ /dev/null @@ -1,93 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.policy_optimization_agent import * -import numpy as np -from logger import * -import tensorflow as tf -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") - -from utils import * - - -class PolicyGradientsAgent(PolicyOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.returns_mean = Signal('Returns Mean') - self.returns_variance = Signal('Returns Variance') - self.signals.append(self.returns_mean) - self.signals.append(self.returns_variance) - self.last_gradient_update_step_idx = 0 - - def learn_from_batch(self, batch): - # batch contains a list of episodes to learn from - current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch) - - for i in reversed(range(len(total_returns))): - if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN: - total_returns[i] = total_returns[0] - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN: - # just take the total return as it is - pass - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE: - # we can get a single transition episode while playing Doom Basic, causing the std to be 0 - if self.std_discounted_return != 0: - total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return - else: - total_returns[i] = 0 - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP: - total_returns[i] -= self.mean_return_over_multiple_episodes[i] - else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") - - targets = total_returns - if not self.env.discrete_controls and len(actions.shape) < 2: - actions = np.expand_dims(actions, -1) - - self.returns_mean.add_sample(np.mean(total_returns)) - self.returns_variance.add_sample(np.std(total_returns)) - - result = self.main_network.online_network.accumulate_gradients({**current_states, 'output_0_0': actions}, targets) - total_loss = result[0] - - return total_loss - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - # convert to batch so we can run it through the network - if self.env.discrete_controls: - # DISCRETE - action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = np.argmax(action_values) - action_value = {"action_probability": action_values[action]} - self.entropy.add_sample(-np.sum(action_values * np.log(action_values + eps))) - else: - # CONTINUOUS - result = self.main_network.online_network.predict(self.tf_input_state(curr_state)) - action_values = result[0].squeeze() - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = action_values - action_value = {} - - return action, action_value diff --git a/agents/policy_optimization_agent.py b/agents/policy_optimization_agent.py deleted file mode 100644 index be23760..0000000 --- a/agents/policy_optimization_agent.py +++ /dev/null @@ -1,123 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not 
use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.agent import * -from memories.memory import Episode - - -class PolicyGradientRescaler(Enum): - TOTAL_RETURN = 0 - FUTURE_RETURN = 1 - FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2 - FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3 # baselined - Q_VALUE = 4 - A_VALUE = 5 - TD_RESIDUAL = 6 - DISCOUNTED_TD_RESIDUAL = 7 - GAE = 8 - - -class PolicyOptimizationAgent(Agent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) - self.networks.append(self.main_network) - - self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler) - - # statistics for variance reduction - self.last_gradient_update_step_idx = 0 - self.max_episode_length = 100000 - self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length) - self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length) - self.entropy = Signal('Entropy') - self.signals.append(self.entropy) - - self.reset_game(do_not_reset_env=True) - - def log_to_screen(self, phase): - # log to screen - if self.current_episode > 0: - screen.log_dict( - OrderedDict([ - ("Worker", self.task_id), - ("Episode", self.current_episode), - ("total reward", self.total_reward_in_current_episode), - ("steps", self.total_steps_counter), - ("training iteration", self.training_iteration) - ]), - prefix=phase - ) - - def update_episode_statistics(self, episode): - episode_discounted_returns = [] - for i in range(episode.length()): - transition = episode.get_transition(i) - episode_discounted_returns.append(transition.total_return) - self.num_episodes_where_step_has_been_seen[i] += 1 - self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \ - self.num_episodes_where_step_has_been_seen[i] - self.mean_return_over_multiple_episodes[i] += transition.total_return / \ - self.num_episodes_where_step_has_been_seen[i] - self.mean_discounted_return = np.mean(episode_discounted_returns) - self.std_discounted_return = np.std(episode_discounted_returns) - - def train(self): - if self.memory.length() == 0: - return 0 - - episode = self.memory.get_episode(0) - - # check if we should calculate gradients or skip - episode_ended = self.memory.num_complete_episodes() >= 1 - num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx - is_t_max_steps_passed = num_steps_passed_since_last_update >= self.tp.agent.num_steps_between_gradient_updates - if not (is_t_max_steps_passed or episode_ended): - return 0 - - total_loss = 0 - if num_steps_passed_since_last_update > 0: - - # we need to update the returns of the episode until now - episode.update_returns(self.tp.agent.discount) - - # get t_max transitions or less if the we got to a terminal state - # will be used for both actor-critic and vanilla PG. 
- # # In order to get full episodes, Vanilla PG will set the end_idx to a very big value. - transitions = [] - start_idx = self.last_gradient_update_step_idx - end_idx = episode.length() - - for idx in range(start_idx, end_idx): - transitions.append(episode.get_transition(idx)) - self.last_gradient_update_step_idx = end_idx - - # update the statistics for the variance reduction techniques - if self.tp.agent.type == 'PolicyGradientsAgent': - self.update_episode_statistics(episode) - - # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes - total_loss = self.learn_from_batch(transitions) - if self.current_episode % self.tp.agent.apply_gradients_every_x_episodes == 0: - self.main_network.apply_gradients_and_sync_networks() - - # move the pointer to the next episode start and discard the episode. we use it only once - if episode_ended: - self.memory.remove_episode(0) - self.last_gradient_update_step_idx = 0 - - return total_loss diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py deleted file mode 100644 index 4a37e69..0000000 --- a/agents/ppo_agent.py +++ /dev/null @@ -1,289 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from agents.actor_critic_agent import * -from random import shuffle - - -# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf -class PPOAgent(ActorCriticAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) - self.critic_network = self.main_network - - # define the policy network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.PPO] - tuning_parameters.agent.optimizer_type = 'Adam' - tuning_parameters.agent.l2_regularization = 0 - self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy', - self.replicated_device, self.worker_device) - self.networks.append(self.policy_network) - - # signals definition - self.value_loss = Signal('Value Loss') - self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') - self.signals.append(self.policy_loss) - self.kl_divergence = Signal('KL Divergence') - self.signals.append(self.kl_divergence) - self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') - self.signals.append(self.unclipped_grads) - - self.reset_game(do_not_reset_env=True) - - def fill_advantages(self, batch): - current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch) - - # * Found not to have any impact * - # current_states_with_timestep = self.concat_state_and_timestep(batch) - - current_state_values = self.critic_network.online_network.predict(current_states).squeeze() - - # calculate advantages - advantages = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: 
- advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: - # get bootstraps - episode_start_idx = 0 - advantages = np.array([]) - # current_state_values[game_overs] = 0 - for idx, game_over in enumerate(game_overs): - if game_over: - # get advantages for the rollout - value_bootstrapping = np.zeros((1,)) - rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping) - - rollout_advantages, _ = \ - self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx+1], - rollout_state_values) - episode_start_idx = idx + 1 - advantages = np.append(advantages, rollout_advantages) - else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") - - # standardize - advantages = (advantages - np.mean(advantages)) / np.std(advantages) - - for transition, advantage in zip(self.memory.transitions, advantages): - transition.info['advantage'] = advantage - - self.action_advantages.add_sample(advantages) - - def train_value_network(self, dataset, epochs): - loss = [] - current_states, _, _, _, _, total_return = self.extract_batch(dataset) - - # * Found not to have any impact * - # add a timestep to the observation - # current_states_with_timestep = self.concat_state_and_timestep(dataset) - - total_return = np.expand_dims(total_return, -1) - mix_fraction = self.tp.agent.value_targets_mix_fraction - for j in range(epochs): - batch_size = len(dataset) - if self.critic_network.online_network.optimizer_type != 'LBFGS': - batch_size = self.tp.batch_size - for i in range(len(dataset) // batch_size): - # split to batches for first order optimization techniques - current_states_batch = { - k: v[i * batch_size:(i + 1) * batch_size] - for k, v in current_states.items() - } - total_return_batch = total_return[i * batch_size:(i + 1) * batch_size] - old_policy_values = force_list(self.critic_network.target_network.predict( - current_states_batch).squeeze()) - if self.critic_network.online_network.optimizer_type != 'LBFGS': - targets = total_return_batch - else: - current_values = self.critic_network.online_network.predict(current_states_batch) - targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction - - inputs = copy.copy(current_states_batch) - for input_index, input in enumerate(old_policy_values): - name = 'output_0_{}'.format(input_index) - if name in self.critic_network.online_network.inputs: - inputs[name] = input - - value_loss = self.critic_network.online_network.accumulate_gradients(inputs, targets) - self.critic_network.apply_gradients_to_online_network() - if self.tp.distributed: - self.critic_network.apply_gradients_to_global_network() - self.critic_network.online_network.reset_accumulated_gradients() - - loss.append([value_loss[0]]) - loss = np.mean(loss, 0) - return loss - - def concat_state_and_timestep(self, dataset): - current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep']) - for transition in dataset] - current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1) - return current_states_with_timestep - - def train_policy_network(self, dataset, epochs): - loss = [] - for j in range(epochs): - loss = { - 'total_loss': [], - 'policy_losses': [], - 'unclipped_grads': [], - 'fetch_result': [] - } - #shuffle(dataset) - for i in range(len(dataset) // self.tp.batch_size): - batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size] - current_states, _, actions, _, _, 
total_return = self.extract_batch(batch) - advantages = np.array([t.info['advantage'] for t in batch]) - if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1: - actions = np.expand_dims(actions, -1) - - # get old policy probabilities and distribution - old_policy = force_list(self.policy_network.target_network.predict(current_states)) - - # calculate gradients and apply on both the local policy network and on the global policy network - fetches = [self.policy_network.online_network.output_heads[0].kl_divergence, - self.policy_network.online_network.output_heads[0].entropy] - - inputs = copy.copy(current_states) - # TODO: why is this output 0 and not output 1? - inputs['output_0_0'] = actions - # TODO: does old_policy_distribution really need to be represented as a list? - # A: yes it does, in the event of discrete controls, it has just a mean - # otherwise, it has both a mean and standard deviation - for input_index, input in enumerate(old_policy): - inputs['output_0_{}'.format(input_index + 1)] = input - total_loss, policy_losses, unclipped_grads, fetch_result =\ - self.policy_network.online_network.accumulate_gradients( - inputs, [advantages], additional_fetches=fetches) - - self.policy_network.apply_gradients_to_online_network() - if self.tp.distributed: - self.policy_network.apply_gradients_to_global_network() - - self.policy_network.online_network.reset_accumulated_gradients() - - loss['total_loss'].append(total_loss) - loss['policy_losses'].append(policy_losses) - loss['unclipped_grads'].append(unclipped_grads) - loss['fetch_result'].append(fetch_result) - - self.unclipped_grads.add_sample(unclipped_grads) - - for key in loss.keys(): - loss[key] = np.mean(loss[key], 0) - - if self.tp.learning_rate_decay_rate != 0: - curr_learning_rate = self.main_network.online_network.get_variable_value(self.tp.learning_rate) - self.curr_learning_rate.add_sample(curr_learning_rate) - else: - curr_learning_rate = self.tp.learning_rate - - # log training parameters - screen.log_dict( - OrderedDict([ - ("Surrogate loss", loss['policy_losses'][0]), - ("KL divergence", loss['fetch_result'][0]), - ("Entropy", loss['fetch_result'][1]), - ("training epoch", j), - ("learning_rate", curr_learning_rate) - ]), - prefix="Policy training" - ) - - self.total_kl_divergence_during_training_process = loss['fetch_result'][0] - self.entropy.add_sample(loss['fetch_result'][1]) - self.kl_divergence.add_sample(loss['fetch_result'][0]) - return loss['total_loss'] - - def update_kl_coefficient(self): - # John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow - # his implementation for now because we know it works well - screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process)) - - # update kl coefficient - kl_target = self.tp.agent.target_kl_divergence - kl_coefficient = self.policy_network.online_network.get_variable_value( - self.policy_network.online_network.output_heads[0].kl_coefficient) - new_kl_coefficient = kl_coefficient - if self.total_kl_divergence_during_training_process > 1.3 * kl_target: - # kl too high => increase regularization - new_kl_coefficient *= 1.5 - elif self.total_kl_divergence_during_training_process < 0.7 * kl_target: - # kl too low => decrease regularization - new_kl_coefficient /= 1.5 - - # update the kl coefficient variable - if kl_coefficient != new_kl_coefficient: - self.policy_network.online_network.set_variable_value( - self.policy_network.online_network.output_heads[0].assign_kl_coefficient, - 
new_kl_coefficient, - self.policy_network.online_network.output_heads[0].kl_coefficient_ph) - - screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient)) - - def post_training_commands(self): - if self.tp.agent.use_kl_regularization: - self.update_kl_coefficient() - - # clean memory - self.memory.clean() - - def train(self): - self.policy_network.sync() - self.critic_network.sync() - - dataset = self.memory.transitions - - self.fill_advantages(dataset) - - # take only the requested number of steps - dataset = dataset[:self.tp.agent.num_consecutive_playing_steps] - - value_loss = self.train_value_network(dataset, 1) - policy_loss = self.train_policy_network(dataset, 10) - - self.value_loss.add_sample(value_loss) - self.policy_loss.add_sample(policy_loss) - self.update_log() # should be done in order to update the data that has been accumulated * while not playing * - return np.append(value_loss, policy_loss) - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - if self.env.discrete_controls: - # DISCRETE - action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - - if phase == RunPhase.TRAIN: - action = self.exploration_policy.get_action(action_values) - else: - action = np.argmax(action_values) - action_info = {"action_probability": action_values[action]} - # self.entropy.add_sample(-np.sum(action_values * np.log(action_values))) - else: - # CONTINUOUS - action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state)) - action_values_mean = action_values_mean.squeeze() - action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: - action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) - else: - action = action_values_mean - action_info = {"action_probability": action_values_mean} - - return action, action_info diff --git a/agents/qr_dqn_agent.py b/agents/qr_dqn_agent.py deleted file mode 100644 index 8888d18..0000000 --- a/agents/qr_dqn_agent.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
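# The adaptive KL penalty applied by update_kl_coefficient() above reduces to a simple
# rule: grow the penalty when the measured KL overshoots the target, shrink it when it
# undershoots. A minimal standalone sketch (the 1.3x / 0.7x thresholds and the factor
# 1.5 come from the code above; everything else here is illustrative):
def adapt_kl_coefficient(kl_coefficient, measured_kl, kl_target):
    if measured_kl > 1.3 * kl_target:
        kl_coefficient *= 1.5   # KL too high -> increase regularization
    elif measured_kl < 0.7 * kl_target:
        kl_coefficient /= 1.5   # KL too low -> decrease regularization
    return kl_coefficient

# e.g. adapt_kl_coefficient(1.0, measured_kl=0.05, kl_target=0.01) -> 1.5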
-# - -from agents.value_optimization_agent import * - - -# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf -class QuantileRegressionDQNAgent(ValueOptimizationAgent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms) - - # prediction's format is (batch,actions,atoms) - def get_q_values(self, quantile_values): - return np.dot(quantile_values, self.quantile_probabilities) - - def learn_from_batch(self, batch): - current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) - - # get the quantiles of the next states and current states - next_state_quantiles = self.main_network.target_network.predict(next_states) - current_quantiles = self.main_network.online_network.predict(current_states) - - # get the optimal actions to take for the next states - target_actions = np.argmax(self.get_q_values(next_state_quantiles), axis=1) - - # calculate the Bellman update - batch_idx = list(range(self.tp.batch_size)) - rewards = np.expand_dims(rewards, -1) - game_overs = np.expand_dims(game_overs, -1) - TD_targets = rewards + (1.0 - game_overs) * self.tp.agent.discount \ - * next_state_quantiles[batch_idx, target_actions] - - # get the locations of the selected actions within the batch for indexing purposes - actions_locations = [[b, a] for b, a in zip(batch_idx, actions)] - - # calculate the cumulative quantile probabilities and reorder them to fit the sorted quantiles order - cumulative_probabilities = np.array(range(self.tp.agent.atoms+1))/float(self.tp.agent.atoms) # tau_i - quantile_midpoints = 0.5*(cumulative_probabilities[1:] + cumulative_probabilities[:-1]) # tau^hat_i - quantile_midpoints = np.tile(quantile_midpoints, (self.tp.batch_size, 1)) - sorted_quantiles = np.argsort(current_quantiles[batch_idx, actions]) - for idx in range(self.tp.batch_size): - quantile_midpoints[idx, :] = quantile_midpoints[idx, sorted_quantiles[idx]] - - # train - result = self.main_network.train_and_sync_networks({ - **current_states, - 'output_0_0': actions_locations, - 'output_0_1': quantile_midpoints, - }, TD_targets) - total_loss = result[0] - - return total_loss diff --git a/agents/value_optimization_agent.py b/agents/value_optimization_agent.py deleted file mode 100644 index 75708d7..0000000 --- a/agents/value_optimization_agent.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
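# For the QuantileRegressionDQNAgent above: with a uniform distribution over `atoms`
# quantiles, the Q-value is simply the mean of the predicted quantile values, and the
# quantile midpoints tau-hat are fixed up front. A small numeric sketch (atoms=4 is an
# arbitrary choice and the quantile values are hypothetical):
import numpy as np

atoms = 4
quantile_probabilities = np.ones(atoms) / atoms            # [0.25, 0.25, 0.25, 0.25]
cumulative_probabilities = np.arange(atoms + 1) / atoms    # tau_i:     [0, 0.25, 0.5, 0.75, 1]
quantile_midpoints = 0.5 * (cumulative_probabilities[1:] +
                            cumulative_probabilities[:-1]) # tau-hat_i: [0.125, 0.375, 0.625, 0.875]

quantile_values = np.array([1.0, 2.0, 3.0, 6.0])           # hypothetical quantiles for one action
q_value = np.dot(quantile_values, quantile_probabilities)  # 3.0 -- the mean of the quantiles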
-# - -import numpy as np - -from agents.agent import Agent -from architectures.network_wrapper import NetworkWrapper -from utils import RunPhase, Signal - - -class ValueOptimizationAgent(Agent): - def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) - self.networks.append(self.main_network) - self.q_values = Signal("Q") - self.signals.append(self.q_values) - - self.reset_game(do_not_reset_env=True) - - # Algorithms for which q_values are calculated from predictions will override this function - def get_q_values(self, prediction): - return prediction - - def get_prediction(self, curr_state): - return self.main_network.online_network.predict(self.tf_input_state(curr_state)) - - def _validate_action(self, policy, action): - if np.array(action).shape != (): - raise ValueError(( - 'The exploration_policy {} returned a vector of actions ' - 'instead of a single action. ValueOptimizationAgents ' - 'require exploration policies which return a single action.' - ).format(policy.__class__.__name__)) - - def choose_action(self, curr_state, phase=RunPhase.TRAIN): - prediction = self.get_prediction(curr_state) - actions_q_values = self.get_q_values(prediction) - - # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: - exploration_policy = self.exploration_policy - else: - exploration_policy = self.evaluation_exploration_policy - - action = exploration_policy.get_action(actions_q_values) - self._validate_action(exploration_policy, action) - - # this is for bootstrapped dqn - if type(actions_q_values) == list and len(actions_q_values) > 0: - actions_q_values = actions_q_values[self.exploration_policy.selected_head] - actions_q_values = actions_q_values.squeeze() - - # store the q values statistics for logging - self.q_values.add_sample(actions_q_values) - - # store information for plotting interactively (actual plotting is done in agent) - if self.tp.visualization.plot_action_values_online: - for idx, action_name in enumerate(self.env.actions_description): - self.episode_running_info[action_name].append(actions_q_values[idx]) - - action_value = {"action_value": actions_q_values[action], "max_action_value": np.max(actions_q_values)} - return action, action_value diff --git a/architectures/__init__.py b/architectures/__init__.py deleted file mode 100644 index cbf2ac5..0000000 --- a/architectures/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
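# ValueOptimizationAgent.get_q_values() above is the hook that value-based agents
# override when the network output is not directly a Q-vector (the quantile-regression
# agent earlier reduces a (actions, atoms) prediction to Q-values by averaging over
# atoms). A minimal sketch of the pattern, with hypothetical class names:
import numpy as np

class BaseValueAgent:
    def get_q_values(self, prediction):
        return prediction                   # default: the prediction is already Q(s, .)

class DistributionalValueAgent(BaseValueAgent):
    def get_q_values(self, prediction):
        return prediction.mean(axis=-1)     # reduce a per-action distribution to Q(s, .)

# DistributionalValueAgent().get_q_values(np.array([[1., 2.], [3., 5.]])) -> [1.5, 4.]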
-# - -from architectures.architecture import * -from logger import failed_imports -try: - from architectures.tensorflow_components.general_network import * - from architectures.tensorflow_components.architecture import * -except ImportError: - failed_imports.append("TensorFlow") - -try: - from architectures.neon_components.general_network import * - from architectures.neon_components.architecture import * -except ImportError: - failed_imports.append("Neon") - -from architectures.network_wrapper import * \ No newline at end of file diff --git a/architectures/neon_components/architecture.py b/architectures/neon_components/architecture.py deleted file mode 100644 index de600c1..0000000 --- a/architectures/neon_components/architecture.py +++ /dev/null @@ -1,129 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys -import copy -from ngraph.frontends.neon import * -import ngraph as ng -from architectures.architecture import * -import numpy as np -from utils import * - - -class NeonArchitecture(Architecture): - def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - Architecture.__init__(self, tuning_parameters, name) - assert tuning_parameters.agent.neon_support, 'Neon is not supported for this agent' - self.clip_error = tuning_parameters.clip_gradients - self.total_loss = None - self.epoch = 0 - self.inputs = [] - self.outputs = [] - self.targets = [] - self.losses = [] - - self.transformer = tuning_parameters.sess - self.network = self.get_model(tuning_parameters) - self.accumulated_gradients = [] - - # training and inference ops - train_output = ng.sequential([ - self.optimizer(self.total_loss), - self.total_loss - ]) - placeholders = self.inputs + self.targets - self.train_op = self.transformer.add_computation( - ng.computation( - train_output, *placeholders - ) - ) - self.predict_op = self.transformer.add_computation( - ng.computation( - self.outputs, self.inputs[0] - ) - ) - - # update weights from array op - self.weights = [ng.placeholder(w.axes) for w in self.total_loss.variables()] - self.set_weights_ops = [] - for target_variable, variable in zip(self.total_loss.variables(), self.weights): - self.set_weights_ops.append(self.transformer.add_computation( - ng.computation( - ng.assign(target_variable, variable), variable - ) - )) - - # get weights op - self.get_variables = self.transformer.add_computation( - ng.computation( - self.total_loss.variables() - ) - ) - - def predict(self, inputs): - batch_size = inputs.shape[0] - - # move batch axis to the end - inputs = inputs.swapaxes(0, -1) - prediction = self.predict_op(inputs) # TODO: problem with multiple inputs - - if type(prediction) != tuple: - prediction = (prediction) - - # process all the outputs from the network - output = [] - for p in prediction: - output.append(p.transpose()[:batch_size].copy()) - - # if there is only one output then we don't need a list - if len(output) == 1: - output = output[0] - return output - - def 
train_on_batch(self, inputs, targets): - loss = self.accumulate_gradients(inputs, targets) - self.apply_and_reset_gradients(self.accumulated_gradients) - return loss - - def get_weights(self): - return self.get_variables() - - def set_weights(self, weights, rate=1.0): - if rate != 1: - current_weights = self.get_weights() - updated_weights = [(1 - rate) * t + rate * o for t, o in zip(current_weights, weights)] - else: - updated_weights = weights - for update_function, variable in zip(self.set_weights_ops, updated_weights): - update_function(variable) - - def accumulate_gradients(self, inputs, targets): - # Neon doesn't currently allow separating the grads calculation and grad apply operations - # so this feature is not currently available. instead we do a full training iteration - inputs = force_list(inputs) - targets = force_list(targets) - - for idx, input in enumerate(inputs): - inputs[idx] = input.swapaxes(0, -1) - - for idx, target in enumerate(targets): - targets[idx] = np.rollaxis(target, 0, len(target.shape)) - - all_inputs = inputs + targets - - loss = np.mean(self.train_op(*all_inputs)) - - return [loss] diff --git a/architectures/neon_components/embedders.py b/architectures/neon_components/embedders.py deleted file mode 100644 index 5f594a3..0000000 --- a/architectures/neon_components/embedders.py +++ /dev/null @@ -1,88 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
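# set_weights(weights, rate) above blends the incoming weights with the current ones,
# which is what a "soft" target-network update looks like:
#     w_updated = (1 - rate) * w_current + rate * w_incoming
# With rate=1.0 this is a plain copy. A tiny numeric sketch:
import numpy as np

w_current, w_incoming, rate = np.array([0.0, 2.0]), np.array([1.0, 0.0]), 0.1
w_updated = (1 - rate) * w_current + rate * w_incoming   # [0.1, 1.8]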
-# - -import ngraph.frontends.neon as neon -import ngraph as ng -from ngraph.util.names import name_scope - - -class InputEmbedder(object): - def __init__(self, input_size, batch_size=None, activation_function=neon.Rectlin(), name="embedder"): - self.name = name - self.input_size = input_size - self.batch_size = batch_size - self.activation_function = activation_function - self.weights_init = neon.GlorotInit() - self.biases_init = neon.ConstantInit() - self.input = None - self.output = None - - def __call__(self, prev_input_placeholder=None): - with name_scope(self.get_name()): - # create the input axes - axes = [] - if len(self.input_size) == 2: - axis_names = ['H', 'W'] - else: - axis_names = ['C', 'H', 'W'] - for axis_size, axis_name in zip(self.input_size, axis_names): - axes.append(ng.make_axis(axis_size, name=axis_name)) - batch_axis_full = ng.make_axis(self.batch_size, name='N') - input_axes = ng.make_axes(axes) - - if prev_input_placeholder is None: - self.input = ng.placeholder(input_axes + [batch_axis_full]) - else: - self.input = prev_input_placeholder - self._build_module() - - return self.input, self.output(self.input) - - def _build_module(self): - pass - - def get_name(self): - return self.name - - -class ImageEmbedder(InputEmbedder): - def __init__(self, input_size, batch_size=None, input_rescaler=255.0, activation_function=neon.Rectlin(), name="embedder"): - InputEmbedder.__init__(self, input_size, batch_size, activation_function, name) - self.input_rescaler = input_rescaler - - def _build_module(self): - # image observation - self.output = neon.Sequential([ - neon.Preprocess(functor=lambda x: x / self.input_rescaler), - neon.Convolution((8, 8, 32), strides=4, activation=self.activation_function, - filter_init=self.weights_init, bias_init=self.biases_init), - neon.Convolution((4, 4, 64), strides=2, activation=self.activation_function, - filter_init=self.weights_init, bias_init=self.biases_init), - neon.Convolution((3, 3, 64), strides=1, activation=self.activation_function, - filter_init=self.weights_init, bias_init=self.biases_init) - ]) - - -class VectorEmbedder(InputEmbedder): - def __init__(self, input_size, batch_size=None, activation_function=neon.Rectlin(), name="embedder"): - InputEmbedder.__init__(self, input_size, batch_size, activation_function, name) - - def _build_module(self): - # vector observation - self.output = neon.Sequential([ - neon.Affine(nout=256, activation=self.activation_function, - weight_init=self.weights_init, bias_init=self.biases_init) - ]) diff --git a/architectures/neon_components/general_network.py b/architectures/neon_components/general_network.py deleted file mode 100644 index 99ac6e9..0000000 --- a/architectures/neon_components/general_network.py +++ /dev/null @@ -1,192 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
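# The ImageEmbedder above is the classic three-layer DQN convolution stack
# (8x8/stride 4, 4x4/stride 2, 3x3/stride 1). As a worked example, for an 84x84 input
# and assuming unpadded ("valid") convolutions (both the 84x84 size and the padding are
# assumptions, not stated in this file):
#     (84 - 8) / 4 + 1 = 20,   (20 - 4) / 2 + 1 = 9,   (9 - 3) / 1 + 1 = 7
# i.e. the spatial resolution shrinks 84 -> 20 -> 9 -> 7 before flattening.
def conv_output_size(size, kernel, stride):
    return (size - kernel) // stride + 1   # valid (unpadded) convolution

# conv_output_size(conv_output_size(conv_output_size(84, 8, 4), 4, 2), 3, 1) -> 7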
-# - -from architectures.neon_components.embedders import * -from architectures.neon_components.heads import * -from architectures.neon_components.middleware import * -from architectures.neon_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes - - -class GeneralNeonNetwork(NeonArchitecture): - def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - self.global_network = global_network - self.network_is_local = network_is_local - self.num_heads_per_network = 1 if tuning_parameters.agent.use_separate_networks_per_head else \ - len(tuning_parameters.agent.output_types) - self.num_networks = 1 if not tuning_parameters.agent.use_separate_networks_per_head else \ - len(tuning_parameters.agent.output_types) - self.input_embedders = [] - self.output_heads = [] - self.activation_function = self.get_activation_function( - tuning_parameters.agent.hidden_layers_activation_function) - - NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) - - def get_activation_function(self, activation_function_string): - activation_functions = { - 'relu': neon.Rectlin(), - 'tanh': neon.Tanh(), - 'sigmoid': neon.Logistic(), - 'elu': neon.Explin(), - 'selu': None, - 'none': None - } - assert activation_function_string in activation_functions.keys(), \ - "Activation function must be one of the following {}".format(activation_functions.keys()) - return activation_functions[activation_function_string] - - def get_input_embedder(self, embedder_type): - # the observation can be either an image or a vector - def get_observation_embedding(with_timestep=False): - if self.input_height > 1: - return ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size, - name="observation") - else: - return VectorEmbedder((self.input_depth, self.input_width + int(with_timestep)), self.batch_size, - name="observation") - - input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"), - InputTypes.Action: VectorEmbedder((self.num_actions,), self.batch_size, name="action"), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), - } - return input_mapping[embedder_type] - - def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach - MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function) - - def get_output_head(self, head_type, head_idx, loss_weight=1.): - output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach - OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach - OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach - OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach - OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach - } - return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) - - def 
get_model(self, tuning_parameters): - """ - :param tuning_parameters: A Preset class instance with all the running paramaters - :type tuning_parameters: Preset - :return: A model - """ - assert len(self.tp.agent.input_types) > 0, "At least one input type should be defined" - assert len(self.tp.agent.output_types) > 0, "At least one output type should be defined" - assert self.tp.agent.middleware_type is not None, "Exactly one middleware type should be defined" - assert len(self.tp.agent.loss_weights) > 0, "At least one loss weight should be defined" - assert len(self.tp.agent.output_types) == len(self.tp.agent.loss_weights), \ - "Number of loss weights should match the number of output types" - local_network_in_distributed_training = self.global_network is not None and self.network_is_local - - tuning_parameters.activation_function = self.activation_function - done_creating_input_placeholders = False - - for network_idx in range(self.num_networks): - with name_scope('network_{}'.format(network_idx)): - #################### - # Input Embeddings # - #################### - - state_embedding = [] - for idx, input_type in enumerate(self.tp.agent.input_types): - # get the class of the input embedder - self.input_embedders.append(self.get_input_embedder(input_type)) - - # in the case each head uses a different network, we still reuse the input placeholders - prev_network_input_placeholder = self.inputs[idx] if done_creating_input_placeholders else None - - # create the input embedder instance and store the input placeholder and the embedding - input_placeholder, embedding = self.input_embedders[-1](prev_network_input_placeholder) - if len(self.inputs) < len(self.tp.agent.input_types): - self.inputs.append(input_placeholder) - state_embedding.append(embedding) - - done_creating_input_placeholders = True - - ############## - # Middleware # - ############## - - state_embedding = ng.concat_along_axis(state_embedding, state_embedding[0].axes[0]) \ - if len(state_embedding) > 1 else state_embedding[0] - self.middleware_embedder = self.get_middleware_embedder(self.tp.agent.middleware_type) - _, self.state_embedding = self.middleware_embedder(state_embedding) - - ################ - # Output Heads # - ################ - - for head_idx in range(self.num_heads_per_network): - for head_copy_idx in range(self.tp.agent.num_output_head_copies): - if self.tp.agent.use_separate_networks_per_head: - # if we use separate networks per head, then the head type corresponds top the network idx - head_type_idx = network_idx - else: - # if we use a single network with multiple heads, then the head type is the current head idx - head_type_idx = head_idx - self.output_heads.append(self.get_output_head(self.tp.agent.output_types[head_type_idx], - head_copy_idx, - self.tp.agent.loss_weights[head_type_idx])) - if self.network_is_local: - output, target_placeholder, input_placeholder = self.output_heads[-1](self.state_embedding) - self.targets.extend(target_placeholder) - else: - output, input_placeholder = self.output_heads[-1](self.state_embedding) - - self.outputs.extend(output) - self.inputs.extend(input_placeholder) - - # Losses - self.losses = [] - for output_head in self.output_heads: - self.losses += output_head.loss - self.total_loss = sum(self.losses) - - # Learning rate - if self.tp.learning_rate_decay_rate != 0: - raise Exception("learning rate decay is not supported in neon") - - # Optimizer - if local_network_in_distributed_training and \ - hasattr(self.tp.agent, "shared_optimizer") and 
self.tp.agent.shared_optimizer: - # distributed training and this is the local network instantiation - self.optimizer = self.global_network.optimizer - else: - if tuning_parameters.agent.optimizer_type == 'Adam': - self.optimizer = neon.Adam( - learning_rate=tuning_parameters.learning_rate, - gradient_clip_norm=tuning_parameters.clip_gradients - ) - elif tuning_parameters.agent.optimizer_type == 'RMSProp': - self.optimizer = neon.RMSProp( - learning_rate=tuning_parameters.learning_rate, - gradient_clip_norm=tuning_parameters.clip_gradients, - decay_rate=0.9, - epsilon=0.01 - ) - elif tuning_parameters.agent.optimizer_type == 'LBFGS': - raise Exception("LBFGS optimizer is not supported in neon") - else: - raise Exception("{} is not a valid optimizer type".format(tuning_parameters.agent.optimizer_type)) diff --git a/architectures/neon_components/heads.py b/architectures/neon_components/heads.py deleted file mode 100644 index df49867..0000000 --- a/architectures/neon_components/heads.py +++ /dev/null @@ -1,194 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import ngraph as ng -from ngraph.util.names import name_scope -import ngraph.frontends.neon as neon -import numpy as np -from utils import force_list -from architectures.neon_components.losses import * - - -class Head(object): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - self.head_idx = head_idx - self.name = "head" - self.output = [] - self.loss = [] - self.loss_type = [] - self.regularizations = [] - self.loss_weight = force_list(loss_weight) - self.weights_init = neon.GlorotInit() - self.biases_init = neon.ConstantInit() - self.target = [] - self.input = [] - self.is_local = is_local - self.batch_size = tuning_parameters.batch_size - - def __call__(self, input_layer): - """ - Wrapper for building the module graph including scoping and loss creation - :param input_layer: the input to the graph - :return: the output of the last layer and the target placeholder - """ - with name_scope(self.get_name()): - self._build_module(input_layer) - - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) - if self.is_local: - self.set_loss() - - if self.is_local: - return self.output, self.target, self.input - else: - return self.output, self.input - - def _build_module(self, input_layer): - """ - Builds the graph of the module - :param input_layer: the input to the graph - :return: None - """ - pass - - def get_name(self): - """ - Get a formatted name for the module - :return: the formatted name - """ - return '{}_{}'.format(self.name, self.head_idx) - - def set_loss(self): - """ - Creates a target placeholder and loss function for each loss_type and regularization - :param loss_type: a tensorflow loss function - :param scope: the name scope to include the 
tensors in - :return: None - """ - # add losses and target placeholder - for idx in range(len(self.loss_type)): - # output_axis = ng.make_axis(self.num_actions, name='q_values') - batch_axis_full = ng.make_axis(self.batch_size, name='N') - target = ng.placeholder(ng.make_axes([self.output[0].axes[0], batch_axis_full])) - self.target.append(target) - loss = self.loss_type[idx](self.target[-1], self.output[idx], - weights=self.loss_weight[idx], scope=self.get_name()) - self.loss.append(loss) - - # add regularizations - for regularization in self.regularizations: - self.loss.append(regularization) - - -class QHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'q_values_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - if tuning_parameters.agent.replace_mse_with_huber_loss: - raise Exception("huber loss is not supported in neon") - else: - self.loss_type = mean_squared_error - - def _build_module(self, input_layer): - # Standard Q Network - self.output = neon.Sequential([ - neon.Affine(nout=self.num_actions, - weight_init=self.weights_init, bias_init=self.biases_init) - ])(input_layer) - - -class DuelingQHead(QHead): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - QHead.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - - def _build_module(self, input_layer): - # Dueling Network - # state value tower - V - output_axis = ng.make_axis(self.num_actions, name='q_values') - - state_value = neon.Sequential([ - neon.Affine(nout=256, activation=neon.Rectlin(), - weight_init=self.weights_init, bias_init=self.biases_init), - neon.Affine(nout=1, - weight_init=self.weights_init, bias_init=self.biases_init) - ])(input_layer) - - # action advantage tower - A - action_advantage_unnormalized = neon.Sequential([ - neon.Affine(nout=256, activation=neon.Rectlin(), - weight_init=self.weights_init, bias_init=self.biases_init), - neon.Affine(axes=output_axis, - weight_init=self.weights_init, bias_init=self.biases_init) - ])(input_layer) - action_advantage = action_advantage_unnormalized - ng.mean(action_advantage_unnormalized) - - repeated_state_value = ng.expand_dims(ng.slice_along_axis(state_value, state_value.axes[0], 0), output_axis, 0) - - # merge to state-action value function Q - self.output = repeated_state_value + action_advantage - - -class MeasurementsPredictionHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'future_measurements_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.num_measurements = tuning_parameters.env.measurements_size[0] \ - if tuning_parameters.env.measurements_size else 0 - self.num_prediction_steps = tuning_parameters.agent.num_predicted_steps_ahead - self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps - if tuning_parameters.agent.replace_mse_with_huber_loss: - raise Exception("huber loss is not supported in neon") - else: - self.loss_type = mean_squared_error - - def _build_module(self, input_layer): - # This is almost exactly the same as Dueling Network but we predict the future measurements for each action - - multistep_measurements_size = self.measurements_size[0] * self.num_predicted_steps_ahead - - # actions expectation tower (expectation stream) - E - with 
name_scope("expectation_stream"): - expectation_stream = neon.Sequential([ - neon.Affine(nout=256, activation=neon.Rectlin(), - weight_init=self.weights_init, bias_init=self.biases_init), - neon.Affine(nout=multistep_measurements_size, - weight_init=self.weights_init, bias_init=self.biases_init) - ])(input_layer) - - # action fine differences tower (action stream) - A - with name_scope("action_stream"): - action_stream_unnormalized = neon.Sequential([ - neon.Affine(nout=256, activation=neon.Rectlin(), - weight_init=self.weights_init, bias_init=self.biases_init), - neon.Affine(nout=self.num_actions * multistep_measurements_size, - weight_init=self.weights_init, bias_init=self.biases_init), - neon.Reshape((self.num_actions, multistep_measurements_size)) - ])(input_layer) - action_stream = action_stream_unnormalized - ng.mean(action_stream_unnormalized) - - repeated_expectation_stream = ng.slice_along_axis(expectation_stream, expectation_stream.axes[0], 0) - repeated_expectation_stream = ng.expand_dims(repeated_expectation_stream, output_axis, 0) - - # merge to future measurements predictions - self.output = repeated_expectation_stream + action_stream - diff --git a/architectures/neon_components/middleware.py b/architectures/neon_components/middleware.py deleted file mode 100644 index 2aa02fd..0000000 --- a/architectures/neon_components/middleware.py +++ /dev/null @@ -1,50 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import ngraph as ng -import ngraph.frontends.neon as neon -from ngraph.util.names import name_scope -import numpy as np - - -class MiddlewareEmbedder(object): - def __init__(self, activation_function=neon.Rectlin(), name="middleware_embedder"): - self.name = name - self.input = None - self.output = None - self.weights_init = neon.GlorotInit() - self.biases_init = neon.ConstantInit() - self.activation_function = activation_function - - def __call__(self, input_layer): - with name_scope(self.get_name()): - self.input = input_layer - self._build_module() - - return self.input, self.output(self.input) - - def _build_module(self): - pass - - def get_name(self): - return self.name - - -class FC_Embedder(MiddlewareEmbedder): - def _build_module(self): - self.output = neon.Sequential([ - neon.Affine(nout=512, activation=self.activation_function, - weight_init=self.weights_init, bias_init=self.biases_init)]) diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py deleted file mode 100644 index 7388587..0000000 --- a/architectures/network_wrapper.py +++ /dev/null @@ -1,187 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from collections import OrderedDict -from configurations import Preset, Frameworks -from logger import * -try: - import tensorflow as tf - from architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork -except ImportError: - failed_imports.append("TensorFlow") - -try: - from architectures.neon_components.general_network import GeneralNeonNetwork -except ImportError: - failed_imports.append("Neon") - - -class NetworkWrapper(object): - """ - Contains multiple networks and managers syncing and gradient updates - between them. - """ - def __init__(self, tuning_parameters, has_target, has_global, name, replicated_device=None, worker_device=None): - """ - :param tuning_parameters: - :type tuning_parameters: Preset - :param has_target: - :param has_global: - :param name: - :param replicated_device: - :param worker_device: - """ - self.tp = tuning_parameters - self.has_target = has_target - self.has_global = has_global - self.name = name - self.sess = tuning_parameters.sess - - if self.tp.framework == Frameworks.TensorFlow: - general_network = GeneralTensorFlowNetwork - elif self.tp.framework == Frameworks.Neon: - general_network = GeneralNeonNetwork - else: - raise Exception("{} Framework is not supported".format(Frameworks().to_string(self.tp.framework))) - - # Global network - the main network shared between threads - self.global_network = None - if self.has_global: - with tf.device(replicated_device): - self.global_network = general_network(tuning_parameters, '{}/global'.format(name), - network_is_local=False) - - # Online network - local copy of the main network used for playing - self.online_network = None - with tf.device(worker_device): - self.online_network = general_network(tuning_parameters, '{}/online'.format(name), - self.global_network, network_is_local=True) - - # Target network - a local, slow updating network used for stabilizing the learning - self.target_network = None - if self.has_target: - with tf.device(worker_device): - self.target_network = general_network(tuning_parameters, '{}/target'.format(name), - network_is_local=True) - - if not self.tp.distributed and self.tp.framework == Frameworks.TensorFlow: - variables_to_restore = tf.global_variables() - variables_to_restore = [v for v in variables_to_restore if '/online' in v.name] - self.model_saver = tf.train.Saver(variables_to_restore) - #, max_to_keep=None) # uncomment to unlimit number of stored checkpoints - if self.tp.sess and self.tp.checkpoint_restore_dir: - checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir) - screen.log_title("Loading checkpoint: {}".format(checkpoint)) - self.model_saver.restore(self.tp.sess, checkpoint) - self.update_target_network() - - def sync(self): - """ - Initializes the weights of the networks to match each other - :return: - """ - self.update_online_network() - self.update_target_network() - - def update_target_network(self, rate=1.0): - """ - Copy weights: online network >>> target network - :param rate: the rate of copying the weights - 1 for copying exactly - """ - if self.target_network: - 
self.target_network.set_weights(self.online_network.get_weights(), rate) - - def update_online_network(self, rate=1.0): - """ - Copy weights: global network >>> online network - :param rate: the rate of copying the weights - 1 for copying exactly - """ - if self.global_network: - self.online_network.set_weights(self.global_network.get_weights(), rate) - - def apply_gradients_to_global_network(self): - """ - Apply gradients from the online network on the global network - :return: - """ - self.global_network.apply_gradients(self.online_network.accumulated_gradients) - - def apply_gradients_to_online_network(self): - """ - Apply gradients from the online network on itself - :return: - """ - self.online_network.apply_gradients(self.online_network.accumulated_gradients) - - def train_and_sync_networks(self, inputs, targets, additional_fetches=[]): - """ - A generic training function that enables multi-threading training using a global network if necessary. - :param inputs: The inputs for the network. - :param targets: The targets corresponding to the given inputs - :param additional_fetches: Any additional tensor the user wants to fetch - :return: The loss of the training iteration - """ - result = self.online_network.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches) - self.apply_gradients_and_sync_networks() - return result - - def apply_gradients_and_sync_networks(self): - """ - Applies the gradients accumulated in the online network to the global network or to itself and syncs the - networks if necessary - """ - if self.global_network: - self.apply_gradients_to_global_network() - self.online_network.reset_accumulated_gradients() - self.update_online_network() - else: - self.online_network.apply_and_reset_gradients(self.online_network.accumulated_gradients) - - def get_local_variables(self): - """ - Get all the variables that are local to the thread - :return: a list of all the variables that are local to the thread - """ - local_variables = [v for v in tf.global_variables() if self.online_network.name in v.name] - if self.has_target: - local_variables += [v for v in tf.global_variables() if self.target_network.name in v.name] - return local_variables - - def get_global_variables(self): - """ - Get all the variables that are shared between threads - :return: a list of all the variables that are shared between threads - """ - global_variables = [v for v in tf.global_variables() if self.global_network.name in v.name] - return global_variables - - def set_session(self, sess): - self.sess = sess - self.online_network.sess = sess - if self.global_network: - self.global_network.sess = sess - if self.target_network: - self.target_network.sess = sess - - def save_model(self, model_id): - saved_model_path = self.model_saver.save(self.tp.sess, os.path.join(self.tp.save_model_dir, - str(model_id) + '.ckpt')) - screen.log_dict( - OrderedDict([ - ("Saving model", saved_model_path), - ]), - prefix="Checkpoint" - ) diff --git a/architectures/tensorflow_components/architecture.py b/architectures/tensorflow_components/architecture.py deleted file mode 100644 index 006ed2c..0000000 --- a/architectures/tensorflow_components/architecture.py +++ /dev/null @@ -1,367 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
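# train_and_sync_networks() above boils down to one update cycle per batch. A schematic
# sketch of the control flow, using the method names defined above:
#
#     loss = online_network.accumulate_gradients(inputs, targets)
#     if global_network is not None:            # distributed training
#         apply_gradients_to_global_network()   # push the local gradients to the shared weights
#         online_network.reset_accumulated_gradients()
#         update_online_network()               # pull the fresh global weights back
#     else:                                     # single-worker training
#         online_network.apply_and_reset_gradients(online_network.accumulated_gradients)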
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -import time - -import numpy as np -import tensorflow as tf - -from architectures.architecture import Architecture -from utils import force_list, squeeze_list -from configurations import Preset, MiddlewareTypes - -def variable_summaries(var): - """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" - with tf.name_scope('summaries'): - layer_weight_name = '_'.join(var.name.split('/')[-3:])[:-2] - - with tf.name_scope(layer_weight_name): - mean = tf.reduce_mean(var) - tf.summary.scalar('mean', mean) - with tf.name_scope('stddev'): - stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) - tf.summary.scalar('stddev', stddev) - tf.summary.scalar('max', tf.reduce_max(var)) - tf.summary.scalar('min', tf.reduce_min(var)) - tf.summary.histogram('histogram', var) - -class TensorFlowArchitecture(Architecture): - def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - """ - :param tuning_parameters: The parameters used for running the algorithm - :type tuning_parameters: Preset - :param name: The name of the network - """ - Architecture.__init__(self, tuning_parameters, name) - self.middleware_embedder = None - self.network_is_local = network_is_local - assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent' - self.sess = tuning_parameters.sess - self.inputs = {} - self.outputs = [] - self.targets = [] - self.losses = [] - self.total_loss = None - self.trainable_weights = [] - self.weights_placeholders = [] - self.curr_rnn_c_in = None - self.curr_rnn_h_in = None - self.gradients_wrt_inputs = [] - self.train_writer = None - - self.optimizer_type = self.tp.agent.optimizer_type - if self.tp.seed is not None: - tf.set_random_seed(self.tp.seed) - with tf.variable_scope(self.name, initializer=tf.contrib.layers.xavier_initializer()): - self.global_step = tf.train.get_or_create_global_step() - - # build the network - self.get_model(tuning_parameters) - - # model weights - self.trainable_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) - - # locks for synchronous training - if self.tp.distributed and not self.tp.agent.async_training and not self.network_is_local: - self.lock_counter = tf.get_variable("lock_counter", [], tf.int32, - initializer=tf.constant_initializer(0, dtype=tf.int32), - trainable=False) - self.lock = self.lock_counter.assign_add(1, use_locking=True) - self.lock_init = self.lock_counter.assign(0) - - self.release_counter = tf.get_variable("release_counter", [], tf.int32, - initializer=tf.constant_initializer(0, dtype=tf.int32), - trainable=False) - self.release = self.release_counter.assign_add(1, use_locking=True) - self.release_init = self.release_counter.assign(0) - - # local network does the optimization so we need to create all the ops we are going to use to optimize - for idx, var in enumerate(self.trainable_weights): - placeholder = tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder') - self.weights_placeholders.append(placeholder) - if self.tp.visualization.tensorboard: - variable_summaries(var) - - 
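# The placeholders created in the loop above mirror each trainable variable one-to-one;
# the assign ops built next (update_weights_from_list) and the apply_gradients op
# defined further down feed externally supplied weight values / accumulated gradients
# into the graph through these placeholders.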
self.update_weights_from_list = [weights.assign(holder) for holder, weights in - zip(self.weights_placeholders, self.trainable_weights)] - - # gradients ops - self.tensor_gradients = tf.gradients(self.total_loss, self.trainable_weights) - self.gradients_norm = tf.global_norm(self.tensor_gradients) - if self.tp.clip_gradients is not None and self.tp.clip_gradients != 0: - self.clipped_grads, self.grad_norms = tf.clip_by_global_norm(self.tensor_gradients, - tuning_parameters.clip_gradients) - - # gradients of the outputs w.r.t. the inputs - # at the moment, this is only used by ddpg - if len(self.outputs) == 1: - self.gradients_wrt_inputs = {name: tf.gradients(self.outputs[0], input_ph) for name, input_ph in self.inputs.items()} - self.gradients_weights_ph = tf.placeholder('float32', self.outputs[0].shape, 'output_gradient_weights') - self.weighted_gradients = tf.gradients(self.outputs[0], self.trainable_weights, self.gradients_weights_ph) - - # L2 regularization - if self.tp.agent.l2_regularization != 0: - self.l2_regularization = [tf.add_n([tf.nn.l2_loss(v) for v in self.trainable_weights]) - * self.tp.agent.l2_regularization] - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.l2_regularization) - - self.inc_step = self.global_step.assign_add(1) - - # defining the optimization process (for LBFGS we have less control over the optimizer) - if self.optimizer_type != 'LBFGS': - # no global network, this is a plain simple centralized training - self.update_weights_from_batch_gradients = self.optimizer.apply_gradients( - zip(self.weights_placeholders, self.trainable_weights), global_step=self.global_step) - - if self.tp.visualization.tensorboard: - current_scope_summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, - scope=tf.contrib.framework.get_name_scope()) - self.merged = tf.summary.merge(current_scope_summaries) - - # initialize or restore model - if not self.tp.distributed: - # Merge all the summaries - - self.init_op = tf.global_variables_initializer() - - if self.sess: - if self.tp.visualization.tensorboard: - # Write the merged summaries to the current experiment directory - self.train_writer = tf.summary.FileWriter(self.tp.experiment_path + '/tensorboard', - self.sess.graph) - self.sess.run(self.init_op) - - self.accumulated_gradients = None - - def reset_accumulated_gradients(self): - """ - Reset the gradients accumulation placeholder - """ - if self.accumulated_gradients is None: - self.accumulated_gradients = self.tp.sess.run(self.trainable_weights) - - for ix, grad in enumerate(self.accumulated_gradients): - self.accumulated_gradients[ix] = grad * 0 - - def accumulate_gradients(self, inputs, targets, additional_fetches=None): - """ - Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation - placeholders - :param additional_fetches: Optional tensors to fetch during gradients calculation - :param inputs: The input batch for the network - :param targets: The targets corresponding to the input batch - :return: A list containing the total loss and the individual network heads losses - """ - - if self.accumulated_gradients is None: - self.reset_accumulated_gradients() - - # feed inputs - if additional_fetches is None: - additional_fetches = [] - - feed_dict = self._feed_dict(inputs) - - # feed targets - targets = force_list(targets) - for placeholder_idx, target in enumerate(targets): - feed_dict[self.targets[placeholder_idx]] = target - - if self.optimizer_type != 'LBFGS': - # set the fetches - fetches = 
[self.gradients_norm] - if self.tp.clip_gradients: - fetches.append(self.clipped_grads) - else: - fetches.append(self.tensor_gradients) - fetches += [self.total_loss, self.losses] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: - fetches.append(self.middleware_embedder.state_out) - additional_fetches_start_idx = len(fetches) - fetches += additional_fetches - - # feed the lstm state if necessary - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: - # we can't always assume that we are starting from scratch here can we? - feed_dict[self.middleware_embedder.c_in] = self.middleware_embedder.c_init - feed_dict[self.middleware_embedder.h_in] = self.middleware_embedder.h_init - - if self.tp.visualization.tensorboard: - fetches += [self.merged] - - # get grads - result = self.tp.sess.run(fetches, feed_dict=feed_dict) - if hasattr(self, 'train_writer') and self.train_writer is not None: - self.train_writer.add_summary(result[-1], self.tp.current_episode) - - # extract the fetches - norm_unclipped_grads, grads, total_loss, losses = result[:4] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: - (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4] - fetched_tensors = [] - if len(additional_fetches) > 0: - fetched_tensors = result[additional_fetches_start_idx:additional_fetches_start_idx + - len(additional_fetches)] - - # accumulate the gradients - for idx, grad in enumerate(grads): - self.accumulated_gradients[idx] += grad - - return total_loss, losses, norm_unclipped_grads, fetched_tensors - - else: - self.optimizer.minimize(session=self.tp.sess, feed_dict=feed_dict) - - return [0] - - def apply_and_reset_gradients(self, gradients, scaler=1.): - """ - Applies the given gradients to the network weights and resets the accumulation placeholder - :param gradients: The gradients to use for the update - :param scaler: A scaling factor that allows rescaling the gradients before applying them - """ - self.apply_gradients(gradients, scaler) - self.reset_accumulated_gradients() - - def apply_gradients(self, gradients, scaler=1.): - """ - Applies the given gradients to the network weights - :param gradients: The gradients to use for the update - :param scaler: A scaling factor that allows rescaling the gradients before applying them - """ - if self.tp.agent.async_training or not self.tp.distributed: - if hasattr(self, 'global_step') and not self.network_is_local: - self.tp.sess.run(self.inc_step) - - if self.optimizer_type != 'LBFGS': - - # lock barrier - if hasattr(self, 'lock_counter'): - self.tp.sess.run(self.lock) - while self.tp.sess.run(self.lock_counter) % self.tp.num_threads != 0: - time.sleep(0.00001) - # rescale the gradients so that they average out with the gradients from the other workers - scaler /= float(self.tp.num_threads) - - # apply gradients - if scaler != 1.: - for gradient in gradients: - gradient /= scaler - feed_dict = dict(zip(self.weights_placeholders, gradients)) - _ = self.tp.sess.run(self.update_weights_from_batch_gradients, feed_dict=feed_dict) - - # release barrier - if hasattr(self, 'release_counter'): - self.tp.sess.run(self.release) - while self.tp.sess.run(self.release_counter) % self.tp.num_threads != 0: - time.sleep(0.00001) - - def _feed_dict(self, inputs): - feed_dict = {} - for input_name, input_value in inputs.items(): - if isinstance(input_name, str): - if input_name not in self.inputs: - raise ValueError(( - 'input name {input_name} was provided to create a feed ' - 'dictionary, but there is no placeholder with that name. 
' - 'placeholder names available include: {placeholder_names}' - ).format( - input_name=input_name, - placeholder_names=', '.join(self.inputs.keys()) - )) - - feed_dict[self.inputs[input_name]] = input_value - elif isinstance(input_name, tf.Tensor) and input_name.op.type == 'Placeholder': - feed_dict[input_name] = input_value - else: - raise ValueError(( - 'input dictionary expects strings or placeholders as keys, ' - 'but found key {key} of type {type}' - ).format( - key=input_name, - type=type(input_name), - )) - - return feed_dict - - def predict(self, inputs, outputs=None, squeeze_output=True): - """ - Run a forward pass of the network using the given input - :param inputs: The input for the network - :param outputs: The output for the network, defaults to self.outputs - :param squeeze_output: call squeeze_list on output - :return: The network output - - WARNING: must only call once per state since each call is assumed by LSTM to be a new time step. - """ - feed_dict = self._feed_dict(inputs) - if outputs is None: - outputs = self.outputs - - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: - feed_dict[self.middleware_embedder.c_in] = self.curr_rnn_c_in - feed_dict[self.middleware_embedder.h_in] = self.curr_rnn_h_in - - output, (self.curr_rnn_c_in, self.curr_rnn_h_in) = self.tp.sess.run([outputs, self.middleware_embedder.state_out], feed_dict=feed_dict) - else: - output = self.tp.sess.run(outputs, feed_dict) - - if squeeze_output: - output = squeeze_list(output) - - return output - - def get_weights(self): - """ - :return: a list of tensors containing the network weights for each layer - """ - return self.trainable_weights - - def set_weights(self, weights, new_rate=1.0): - """ - Sets the network weights from the given list of weights tensors - """ - feed_dict = {} - old_weights, new_weights = self.tp.sess.run([self.get_weights(), weights]) - for placeholder_idx, new_weight in enumerate(new_weights): - feed_dict[self.weights_placeholders[placeholder_idx]]\ - = new_rate * new_weight + (1 - new_rate) * old_weights[placeholder_idx] - self.tp.sess.run(self.update_weights_from_list, feed_dict) - - def write_graph_to_logdir(self, summary_dir): - """ - Writes the tensorflow graph to the logdir for tensorboard visualization - :param summary_dir: the path to the logdir - """ - summary_writer = tf.summary.FileWriter(summary_dir) - summary_writer.add_graph(self.sess.graph) - - def get_variable_value(self, variable): - """ - Get the value of a variable from the graph - :param variable: the variable - :return: the value of the variable - """ - return self.sess.run(variable) - - def set_variable_value(self, assign_op, value, placeholder=None): - """ - Updates the value of a variable. - This requires having an assign operation for the variable, and a placeholder which will provide the value - :param assign_op: an assign operation for the variable - :param value: a value to set the variable to - :param placeholder: a placeholder to hold the given value for injecting it into the variable - """ - self.sess.run(assign_op, feed_dict={placeholder: value}) diff --git a/architectures/tensorflow_components/embedders.py b/architectures/tensorflow_components/embedders.py deleted file mode 100644 index b3f36cb..0000000 --- a/architectures/tensorflow_components/embedders.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
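# _feed_dict() above accepts input dictionaries keyed either by the string names used
# when the input placeholders were created or by the placeholder tensors themselves.
# A hypothetical call (the 'observation' key matches the embedder name used elsewhere
# in this codebase; the shape is purely illustrative):
#
#     network.predict({'observation': np.zeros((1, 84, 84, 4))})
#
# Unknown string keys raise a ValueError listing the available placeholder names.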
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import tensorflow as tf -from configurations import EmbedderDepth, EmbedderWidth - - -class InputEmbedder(object): - def __init__(self, input_size, activation_function=tf.nn.relu, - embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide, - name="embedder"): - self.name = name - self.input_size = input_size - self.activation_function = activation_function - self.input = None - self.output = None - self.embedder_depth = embedder_depth - self.embedder_width = embedder_width - - def __call__(self, prev_input_placeholder=None): - with tf.variable_scope(self.get_name()): - if prev_input_placeholder is None: - self.input = tf.placeholder("float", shape=(None,) + self.input_size, name=self.get_name()) - else: - self.input = prev_input_placeholder - self._build_module() - - return self.input, self.output - - def _build_module(self): - pass - - def get_name(self): - return self.name - - -class ImageEmbedder(InputEmbedder): - def __init__(self, input_size, input_rescaler=255.0, activation_function=tf.nn.relu, - embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide, - name="embedder"): - InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name) - self.input_rescaler = input_rescaler - - def _build_module(self): - # image observation - rescaled_observation_stack = self.input / self.input_rescaler - - if self.embedder_depth == EmbedderDepth.Shallow: - # same embedder as used in the original DQN paper - self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack, - filters=32, kernel_size=(8, 8), strides=(4, 4), - activation=self.activation_function, data_format='channels_last', - name='conv1') - self.observation_conv2 = tf.layers.conv2d(self.observation_conv1, - filters=64, kernel_size=(4, 4), strides=(2, 2), - activation=self.activation_function, data_format='channels_last', - name='conv2') - self.observation_conv3 = tf.layers.conv2d(self.observation_conv2, - filters=64, kernel_size=(3, 3), strides=(1, 1), - activation=self.activation_function, data_format='channels_last', - name='conv3' - ) - - self.output = tf.contrib.layers.flatten(self.observation_conv3) - - elif self.embedder_depth == EmbedderDepth.Deep: - # the embedder used in the CARLA papers - self.observation_conv1 = tf.layers.conv2d(rescaled_observation_stack, - filters=32, kernel_size=(5, 5), strides=(2, 2), - activation=self.activation_function, data_format='channels_last', - name='conv1') - self.observation_conv2 = tf.layers.conv2d(self.observation_conv1, - filters=32, kernel_size=(3, 3), strides=(1, 1), - activation=self.activation_function, data_format='channels_last', - name='conv2') - self.observation_conv3 = tf.layers.conv2d(self.observation_conv2, - filters=64, kernel_size=(3, 3), strides=(2, 2), - activation=self.activation_function, data_format='channels_last', - name='conv3') - self.observation_conv4 = tf.layers.conv2d(self.observation_conv3, - filters=64, kernel_size=(3, 3), strides=(1, 1), - activation=self.activation_function, data_format='channels_last', - name='conv4') - self.observation_conv5 = 
tf.layers.conv2d(self.observation_conv4, - filters=128, kernel_size=(3, 3), strides=(2, 2), - activation=self.activation_function, data_format='channels_last', - name='conv5') - self.observation_conv6 = tf.layers.conv2d(self.observation_conv5, - filters=128, kernel_size=(3, 3), strides=(1, 1), - activation=self.activation_function, data_format='channels_last', - name='conv6') - self.observation_conv7 = tf.layers.conv2d(self.observation_conv6, - filters=256, kernel_size=(3, 3), strides=(2, 2), - activation=self.activation_function, data_format='channels_last', - name='conv7') - self.observation_conv8 = tf.layers.conv2d(self.observation_conv7, - filters=256, kernel_size=(3, 3), strides=(1, 1), - activation=self.activation_function, data_format='channels_last', - name='conv8') - - self.output = tf.contrib.layers.flatten(self.observation_conv8) - else: - raise ValueError("The defined embedder complexity value is invalid") - - -class VectorEmbedder(InputEmbedder): - def __init__(self, input_size, activation_function=tf.nn.relu, - embedder_depth=EmbedderDepth.Shallow, embedder_width=EmbedderWidth.Wide, - name="embedder"): - InputEmbedder.__init__(self, input_size, activation_function, embedder_depth, embedder_width, name) - - def _build_module(self): - # vector observation - input_layer = tf.contrib.layers.flatten(self.input) - - width = 128 if self.embedder_width == EmbedderWidth.Wide else 32 - - if self.embedder_depth == EmbedderDepth.Shallow: - self.output = tf.layers.dense(input_layer, 2*width, activation=self.activation_function, - name='fc1') - - elif self.embedder_depth == EmbedderDepth.Deep: - # the embedder used in the CARLA papers - self.observation_fc1 = tf.layers.dense(input_layer, width, activation=self.activation_function, - name='fc1') - self.observation_fc2 = tf.layers.dense(self.observation_fc1, width, activation=self.activation_function, - name='fc2') - self.output = tf.layers.dense(self.observation_fc2, width, activation=self.activation_function, - name='fc3') - else: - raise ValueError("The defined embedder complexity value is invalid") diff --git a/architectures/tensorflow_components/general_network.py b/architectures/tensorflow_components/general_network.py deleted file mode 100644 index a4e69ff..0000000 --- a/architectures/tensorflow_components/general_network.py +++ /dev/null @@ -1,206 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from architectures.tensorflow_components.embedders import * -from architectures.tensorflow_components.heads import * -from architectures.tensorflow_components.middleware import * -from architectures.tensorflow_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes - - -class GeneralTensorFlowNetwork(TensorFlowArchitecture): - """ - A generalized version of all possible networks implemented using tensorflow. 
- """ - def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - self.global_network = global_network - self.network_is_local = network_is_local - self.num_heads_per_network = 1 if tuning_parameters.agent.use_separate_networks_per_head else \ - len(tuning_parameters.agent.output_types) - self.num_networks = 1 if not tuning_parameters.agent.use_separate_networks_per_head else \ - len(tuning_parameters.agent.output_types) - self.input_embedders = [] - self.output_heads = [] - self.activation_function = self.get_activation_function( - tuning_parameters.agent.hidden_layers_activation_function) - self.embedder_width = tuning_parameters.agent.embedder_width - - TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) - - def get_activation_function(self, activation_function_string): - activation_functions = { - 'relu': tf.nn.relu, - 'tanh': tf.nn.tanh, - 'sigmoid': tf.nn.sigmoid, - 'elu': tf.nn.elu, - 'selu': tf.nn.selu, - 'none': None - } - assert activation_function_string in activation_functions.keys(), \ - "Activation function must be one of the following {}".format(activation_functions.keys()) - return activation_functions[activation_function_string] - - def get_input_embedder(self, embedder_type): - # the observation can be either an image or a vector - def get_observation_embedding(with_timestep=False): - if self.input_height > 1: - return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", - input_rescaler=self.tp.agent.input_rescaler, embedder_width=self.embedder_width) - else: - return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation", - embedder_width=self.embedder_width) - - input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements", - embedder_width=self.embedder_width), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector", - embedder_width=self.embedder_width), - InputTypes.Action: VectorEmbedder((self.num_actions,), name="action", - embedder_width=self.embedder_width), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), - } - return input_mapping[embedder_type] - - def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: LSTM_Embedder, - MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function, self.embedder_width) - - def get_output_head(self, head_type, head_idx, loss_weight=1.): - output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: VHead, - OutputTypes.Pi: PolicyHead, - OutputTypes.MeasurementsPrediction: MeasurementsPredictionHead, - OutputTypes.DNDQ: DNDQHead, - OutputTypes.NAF: NAFHead, - OutputTypes.PPO: PPOHead, - OutputTypes.PPO_V: PPOVHead, - OutputTypes.CategoricalQ: CategoricalQHead, - OutputTypes.QuantileRegressionQ: QuantileRegressionQHead - } - return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) - - def get_model(self, tuning_parameters): - """ - :param tuning_parameters: A Preset class instance with all the running paramaters - :type tuning_parameters: Preset - :return: A model - """ - assert len(self.tp.agent.input_types) > 0, "At least one input type should be defined" - assert len(self.tp.agent.output_types) > 0, "At least one output type should be defined" - assert self.tp.agent.middleware_type is not None, "Exactly one 
middleware type should be defined" - assert len(self.tp.agent.loss_weights) > 0, "At least one loss weight should be defined" - assert len(self.tp.agent.output_types) == len(self.tp.agent.loss_weights), \ - "Number of loss weights should match the number of output types" - local_network_in_distributed_training = self.global_network is not None and self.network_is_local - - tuning_parameters.activation_function = self.activation_function - - for network_idx in range(self.num_networks): - with tf.variable_scope('network_{}'.format(network_idx)): - #################### - # Input Embeddings # - #################### - - state_embedding = [] - for input_name, input_type in self.tp.agent.input_types.items(): - # get the class of the input embedder - input_embedder = self.get_input_embedder(input_type) - self.input_embedders.append(input_embedder) - - # input placeholders are reused between networks. on the first network, store the placeholders - # generated by the input_embedders in self.inputs. on the rest of the networks, pass - # the existing input_placeholders into the input_embedders. - if network_idx == 0: - input_placeholder, embedding = input_embedder() - self.inputs[input_name] = input_placeholder - else: - input_placeholder, embedding = input_embedder(self.inputs[input_name]) - - state_embedding.append(embedding) - - ############## - # Middleware # - ############## - - state_embedding = tf.concat(state_embedding, axis=-1) if len(state_embedding) > 1 else state_embedding[0] - self.middleware_embedder = self.get_middleware_embedder(self.tp.agent.middleware_type) - _, self.state_embedding = self.middleware_embedder(state_embedding) - - ################ - # Output Heads # - ################ - - for head_idx in range(self.num_heads_per_network): - for head_copy_idx in range(self.tp.agent.num_output_head_copies): - if self.tp.agent.use_separate_networks_per_head: - # if we use separate networks per head, then the head type corresponds top the network idx - head_type_idx = network_idx - else: - # if we use a single network with multiple heads, then the head type is the current head idx - head_type_idx = head_idx - self.output_heads.append(self.get_output_head(self.tp.agent.output_types[head_type_idx], - head_copy_idx, - self.tp.agent.loss_weights[head_type_idx])) - - if self.tp.agent.stop_gradients_from_head[head_idx]: - head_input = tf.stop_gradient(self.state_embedding) - else: - head_input = self.state_embedding - - # build the head - if self.network_is_local: - output, target_placeholder, input_placeholders = self.output_heads[-1](head_input) - self.targets.extend(target_placeholder) - else: - output, input_placeholders = self.output_heads[-1](head_input) - - self.outputs.extend(output) - # TODO: use head names as well - for placeholder_index, input_placeholder in enumerate(input_placeholders): - self.inputs['output_{}_{}'.format(head_idx, placeholder_index)] = input_placeholder - - # Losses - self.losses = tf.losses.get_losses(self.name) - self.losses += tf.losses.get_regularization_losses(self.name) - self.total_loss = tf.losses.compute_weighted_loss(self.losses, scope=self.name) - if self.tp.visualization.tensorboard: - tf.summary.scalar('total_loss', self.total_loss) - - - # Learning rate - if self.tp.learning_rate_decay_rate != 0: - self.tp.learning_rate = tf.train.exponential_decay( - self.tp.learning_rate, self.global_step, decay_steps=self.tp.learning_rate_decay_steps, - decay_rate=self.tp.learning_rate_decay_rate, staircase=True) - - # Optimizer - if 
local_network_in_distributed_training and \ - hasattr(self.tp.agent, "shared_optimizer") and self.tp.agent.shared_optimizer: - # distributed training and this is the local network instantiation - self.optimizer = self.global_network.optimizer - else: - if tuning_parameters.agent.optimizer_type == 'Adam': - self.optimizer = tf.train.AdamOptimizer(learning_rate=tuning_parameters.learning_rate) - elif tuning_parameters.agent.optimizer_type == 'RMSProp': - self.optimizer = tf.train.RMSPropOptimizer(tuning_parameters.learning_rate, decay=0.9, epsilon=0.01) - elif tuning_parameters.agent.optimizer_type == 'LBFGS': - self.optimizer = tf.contrib.opt.ScipyOptimizerInterface(self.total_loss, method='L-BFGS-B', - options={'maxiter': 25}) - else: - raise Exception("{} is not a valid optimizer type".format(tuning_parameters.agent.optimizer_type)) diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py deleted file mode 100644 index b463d7f..0000000 --- a/architectures/tensorflow_components/heads.py +++ /dev/null @@ -1,558 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import tensorflow as tf -import numpy as np -from utils import force_list - - -# Used to initialize weights for policy and value output layers -def normalized_columns_initializer(std=1.0): - def _initializer(shape, dtype=None, partition_info=None): - out = np.random.randn(*shape).astype(np.float32) - out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) - return tf.constant(out) - return _initializer - - -class Head(object): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - self.head_idx = head_idx - self.name = "head" - self.output = [] - self.loss = [] - self.loss_type = [] - self.regularizations = [] - self.loss_weight = force_list(loss_weight) - self.target = [] - self.input = [] - self.is_local = is_local - - def __call__(self, input_layer): - """ - Wrapper for building the module graph including scoping and loss creation - :param input_layer: the input to the graph - :return: the output of the last layer and the target placeholder - """ - with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()): - self._build_module(input_layer) - - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) - if self.is_local: - self.set_loss() - self._post_build() - - if self.is_local: - return self.output, self.target, self.input - else: - return self.output, self.input - - def _build_module(self, input_layer): - """ - Builds the graph of the module - - This method is called early on from __call__. It is expected to store the graph - in self.output. 
- - :param input_layer: the input to the graph - :return: None - """ - pass - - def _post_build(self): - """ - Optional function that allows adding any extra definitions after the head has been fully defined - For example, this allows doing additional calculations that are based on the loss - :return: None - """ - pass - - def get_name(self): - """ - Get a formatted name for the module - :return: the formatted name - """ - return '{}_{}'.format(self.name, self.head_idx) - - def set_loss(self): - """ - Creates a target placeholder and loss function for each loss_type and regularization - :param loss_type: a tensorflow loss function - :param scope: the name scope to include the tensors in - :return: None - """ - # add losses and target placeholder - for idx in range(len(self.loss_type)): - target = tf.placeholder('float', self.output[idx].shape, '{}_target'.format(self.get_name())) - self.target.append(target) - loss = self.loss_type[idx](self.target[-1], self.output[idx], - weights=self.loss_weight[idx], scope=self.get_name()) - self.loss.append(loss) - - # add regularizations - for regularization in self.regularizations: - self.loss.append(regularization) - - -class QHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'q_values_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - if tuning_parameters.agent.replace_mse_with_huber_loss: - self.loss_type = tf.losses.huber_loss - else: - self.loss_type = tf.losses.mean_squared_error - - def _build_module(self, input_layer): - # Standard Q Network - self.output = tf.layers.dense(input_layer, self.num_actions, name='output') - - -class DuelingQHead(QHead): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - QHead.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - - def _build_module(self, input_layer): - # state value tower - V - with tf.variable_scope("state_value"): - state_value = tf.layers.dense(input_layer, 256, activation=tf.nn.relu, name='fc1') - state_value = tf.layers.dense(state_value, 1, name='fc2') - # state_value = tf.expand_dims(state_value, axis=-1) - - # action advantage tower - A - with tf.variable_scope("action_advantage"): - action_advantage = tf.layers.dense(input_layer, 256, activation=tf.nn.relu, name='fc1') - action_advantage = tf.layers.dense(action_advantage, self.num_actions, name='fc2') - action_advantage = action_advantage - tf.reduce_mean(action_advantage) - - # merge to state-action value function Q - self.output = tf.add(state_value, action_advantage, name='output') - - -class VHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'v_values_head' - if tuning_parameters.agent.replace_mse_with_huber_loss: - self.loss_type = tf.losses.huber_loss - else: - self.loss_type = tf.losses.mean_squared_error - - def _build_module(self, input_layer): - # Standard V Network - self.output = tf.layers.dense(input_layer, 1, name='output', - kernel_initializer=normalized_columns_initializer(1.0)) - - -class PolicyHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'policy_values_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - 
self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range) - self.discrete_controls = tuning_parameters.env_instance.discrete_controls - self.exploration_policy = tuning_parameters.exploration.policy - self.exploration_variance = 2*self.output_scale*tuning_parameters.exploration.initial_noise_variance_percentage - if not self.discrete_controls and not self.output_scale: - raise ValueError("For continuous controls, an output scale for the network must be specified") - self.beta = tuning_parameters.agent.beta_entropy - - def _build_module(self, input_layer): - eps = 1e-15 - if self.discrete_controls: - self.actions = tf.placeholder(tf.int32, [None], name="actions") - else: - self.actions = tf.placeholder(tf.float32, [None, self.num_actions], name="actions") - self.input = [self.actions] - - # Policy Head - if self.discrete_controls: - policy_values = tf.layers.dense(input_layer, self.num_actions, name='fc') - self.policy_mean = tf.nn.softmax(policy_values, name="policy") - - # define the distributions for the policy and the old policy - # (the + eps is to prevent probability 0 which will cause the log later on to be -inf) - self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_mean + eps)) - self.output = self.policy_mean - else: - # mean - policy_values_mean = tf.layers.dense(input_layer, self.num_actions, activation=tf.nn.tanh, name='fc_mean') - self.policy_mean = tf.multiply(policy_values_mean, self.output_scale, name='output_mean') - - self.output = [self.policy_mean] - - # std - if self.exploration_policy == 'ContinuousEntropy': - policy_values_std = tf.layers.dense(input_layer, self.num_actions, - kernel_initializer=normalized_columns_initializer(0.01), name='fc_std') - self.policy_std = tf.nn.softplus(policy_values_std, name='output_variance') + eps - - self.output.append(self.policy_std) - - else: - self.policy_std = tf.constant(self.exploration_variance, dtype='float32', shape=(self.num_actions,)) - - # define the distributions for the policy and the old policy - self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, - self.policy_std) - - if self.is_local: - # add entropy regularization - if self.beta: - self.entropy = tf.reduce_mean(self.policy_distribution.entropy()) - self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization') - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations) - - # calculate loss - self.action_log_probs_wrt_policy = self.policy_distribution.log_prob(self.actions) - self.advantages = tf.placeholder(tf.float32, [None], name="advantages") - self.target = self.advantages - self.loss = -tf.reduce_mean(self.action_log_probs_wrt_policy * self.advantages) - tf.losses.add_loss(self.loss_weight[0] * self.loss) - - -class MeasurementsPredictionHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'future_measurements_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.num_measurements = tuning_parameters.env.measurements_size[0] \ - if tuning_parameters.env.measurements_size else 0 - self.num_prediction_steps = tuning_parameters.agent.num_predicted_steps_ahead - self.multi_step_measurements_size = self.num_measurements * self.num_prediction_steps - if tuning_parameters.agent.replace_mse_with_huber_loss: - self.loss_type = tf.losses.huber_loss - else: - 
self.loss_type = tf.losses.mean_squared_error - - def _build_module(self, input_layer): - # This is almost exactly the same as Dueling Network but we predict the future measurements for each action - # actions expectation tower (expectation stream) - E - with tf.variable_scope("expectation_stream"): - expectation_stream = tf.layers.dense(input_layer, 256, activation=tf.nn.elu, name='fc1') - expectation_stream = tf.layers.dense(expectation_stream, self.multi_step_measurements_size, name='output') - expectation_stream = tf.expand_dims(expectation_stream, axis=1) - - # action fine differences tower (action stream) - A - with tf.variable_scope("action_stream"): - action_stream = tf.layers.dense(input_layer, 256, activation=tf.nn.elu, name='fc1') - action_stream = tf.layers.dense(action_stream, self.num_actions * self.multi_step_measurements_size, - name='output') - action_stream = tf.reshape(action_stream, - (tf.shape(action_stream)[0], self.num_actions, self.multi_step_measurements_size)) - action_stream = action_stream - tf.reduce_mean(action_stream, reduction_indices=1, keep_dims=True) - - # merge to future measurements predictions - self.output = tf.add(expectation_stream, action_stream, name='output') - - -class DNDQHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'dnd_q_values_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.DND_size = tuning_parameters.agent.dnd_size - self.DND_key_error_threshold = tuning_parameters.agent.DND_key_error_threshold - self.l2_norm_added_delta = tuning_parameters.agent.l2_norm_added_delta - self.new_value_shift_coefficient = tuning_parameters.agent.new_value_shift_coefficient - self.number_of_nn = tuning_parameters.agent.number_of_knn - if tuning_parameters.agent.replace_mse_with_huber_loss: - self.loss_type = tf.losses.huber_loss - else: - self.loss_type = tf.losses.mean_squared_error - self.tp = tuning_parameters - self.dnd_embeddings = [None]*self.num_actions - self.dnd_values = [None]*self.num_actions - self.dnd_indices = [None]*self.num_actions - - def _build_module(self, input_layer): - # DND based Q head - from memories import differentiable_neural_dictionary - - if self.tp.checkpoint_restore_dir: - self.DND = differentiable_neural_dictionary.load_dnd(self.tp.checkpoint_restore_dir) - else: - self.DND = differentiable_neural_dictionary.QDND( - self.DND_size, input_layer.get_shape()[-1], self.num_actions, self.new_value_shift_coefficient, - key_error_threshold=self.DND_key_error_threshold, learning_rate=self.tp.learning_rate) - - # Retrieve info from DND dictionary - # We assume that all actions have enough entries in the DND - self.output = tf.transpose([ - self._q_value(input_layer, action) - for action in range(self.num_actions) - ]) - - def _q_value(self, input_layer, action): - result = tf.py_func(self.DND.query, - [input_layer, action, self.number_of_nn], - [tf.float64, tf.float64, tf.int64]) - self.dnd_embeddings[action] = tf.to_float(result[0]) - self.dnd_values[action] = tf.to_float(result[1]) - self.dnd_indices[action] = result[2] - - # DND calculation - square_diff = tf.square(self.dnd_embeddings[action] - tf.expand_dims(input_layer, 1)) - distances = tf.reduce_sum(square_diff, axis=2) + [self.l2_norm_added_delta] - weights = 1.0 / distances - normalised_weights = weights / tf.reduce_sum(weights, axis=1, keep_dims=True) - return tf.reduce_sum(self.dnd_values[action] * 
normalised_weights, axis=1) - - -class NAFHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'naf_q_values_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range) - if tuning_parameters.agent.replace_mse_with_huber_loss: - self.loss_type = tf.losses.huber_loss - else: - self.loss_type = tf.losses.mean_squared_error - - def _build_module(self, input_layer): - # NAF - self.action = tf.placeholder(tf.float32, [None, self.num_actions], name="action") - self.input = self.action - - # V Head - self.V = tf.layers.dense(input_layer, 1, name='V') - - # mu Head - mu_unscaled = tf.layers.dense(input_layer, self.num_actions, activation=tf.nn.tanh, name='mu_unscaled') - self.mu = tf.multiply(mu_unscaled, self.output_scale, name='mu') - - # A Head - # l_vector is a vector that includes a lower-triangular matrix values - self.l_vector = tf.layers.dense(input_layer, (self.num_actions * (self.num_actions + 1)) / 2, name='l_vector') - - # Convert l to a lower triangular matrix and exponentiate its diagonal - - i = 0 - columns = [] - for col in range(self.num_actions): - start_row = col - num_non_zero_elements = self.num_actions - start_row - zeros_column_part = tf.zeros_like(self.l_vector[:, 0:start_row]) - diag_element = tf.expand_dims(tf.exp(self.l_vector[:, i]), 1) - non_zeros_non_diag_column_part = self.l_vector[:, (i + 1):(i + num_non_zero_elements)] - columns.append(tf.concat([zeros_column_part, diag_element, non_zeros_non_diag_column_part], axis=1)) - i += num_non_zero_elements - self.L = tf.transpose(tf.stack(columns, axis=1), (0, 2, 1)) - - # P = L*L^T - self.P = tf.matmul(self.L, tf.transpose(self.L, (0, 2, 1))) - - # A = -1/2 * (u - mu)^T * P * (u - mu) - action_diff = tf.expand_dims(self.action - self.mu, -1) - a_matrix_form = -0.5 * tf.matmul(tf.transpose(action_diff, (0, 2, 1)), tf.matmul(self.P, action_diff)) - self.A = tf.reshape(a_matrix_form, [-1, 1]) - - # Q Head - self.Q = tf.add(self.V, self.A, name='Q') - - self.output = self.Q - - -class PPOHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'ppo_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.discrete_controls = tuning_parameters.env_instance.discrete_controls - self.output_scale = np.max(tuning_parameters.env_instance.action_space_abs_range) - - # kl coefficient and its corresponding assignment operation and placeholder - self.kl_coefficient = tf.Variable(tuning_parameters.agent.initial_kl_coefficient, - trainable=False, name='kl_coefficient') - self.kl_coefficient_ph = tf.placeholder('float', name='kl_coefficient_ph') - self.assign_kl_coefficient = tf.assign(self.kl_coefficient, self.kl_coefficient_ph) - - self.kl_cutoff = 2*tuning_parameters.agent.target_kl_divergence - self.high_kl_penalty_coefficient = tuning_parameters.agent.high_kl_penalty_coefficient - self.clip_likelihood_ratio_using_epsilon = tuning_parameters.agent.clip_likelihood_ratio_using_epsilon - self.use_kl_regularization = tuning_parameters.agent.use_kl_regularization - self.beta = tuning_parameters.agent.beta_entropy - - def _build_module(self, input_layer): - eps = 1e-15 - if self.discrete_controls: - self.actions = tf.placeholder(tf.int32, [None], name="actions") - 
else: - self.actions = tf.placeholder(tf.float32, [None, self.num_actions], name="actions") - self.old_policy_mean = tf.placeholder(tf.float32, [None, self.num_actions], "old_policy_mean") - self.old_policy_std = tf.placeholder(tf.float32, [None, self.num_actions], "old_policy_std") - - # Policy Head - if self.discrete_controls: - self.input = [self.actions, self.old_policy_mean] - policy_values = tf.layers.dense(input_layer, self.num_actions, name='policy_fc') - self.policy_mean = tf.nn.softmax(policy_values, name="policy") - - # define the distributions for the policy and the old policy - self.policy_distribution = tf.contrib.distributions.Categorical(probs=(self.policy_mean + eps)) - self.old_policy_distribution = tf.contrib.distributions.Categorical(probs=self.old_policy_mean) - - self.output = self.policy_mean - else: - self.input = [self.actions, self.old_policy_mean, self.old_policy_std] - self.policy_mean = tf.layers.dense(input_layer, self.num_actions, name='policy_mean') - self.policy_logstd = tf.Variable(np.zeros((1, self.num_actions)), dtype='float32') - self.policy_std = tf.tile(tf.exp(self.policy_logstd), [tf.shape(input_layer)[0], 1], name='policy_std') - - # define the distributions for the policy and the old policy - self.policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.policy_mean, - self.policy_std) - self.old_policy_distribution = tf.contrib.distributions.MultivariateNormalDiag(self.old_policy_mean, - self.old_policy_std) - - self.output = [self.policy_mean, self.policy_std] - - self.action_probs_wrt_policy = tf.exp(self.policy_distribution.log_prob(self.actions)) - self.action_probs_wrt_old_policy = tf.exp(self.old_policy_distribution.log_prob(self.actions)) - self.entropy = tf.reduce_mean(self.policy_distribution.entropy()) - - # add kl divergence regularization - self.kl_divergence = tf.reduce_mean(tf.contrib.distributions.kl_divergence(self.old_policy_distribution, - self.policy_distribution)) - if self.use_kl_regularization: - # no clipping => use kl regularization - self.weighted_kl_divergence = tf.multiply(self.kl_coefficient, self.kl_divergence) - self.regularizations = self.weighted_kl_divergence + self.high_kl_penalty_coefficient * \ - tf.square(tf.maximum(0.0, self.kl_divergence - self.kl_cutoff)) - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, self.regularizations) - - # calculate surrogate loss - self.advantages = tf.placeholder(tf.float32, [None], name="advantages") - self.target = self.advantages - self.likelihood_ratio = self.action_probs_wrt_policy / (self.action_probs_wrt_old_policy + eps) - if self.clip_likelihood_ratio_using_epsilon is not None: - max_value = 1 + self.clip_likelihood_ratio_using_epsilon - min_value = 1 - self.clip_likelihood_ratio_using_epsilon - self.clipped_likelihood_ratio = tf.clip_by_value(self.likelihood_ratio, min_value, max_value) - self.scaled_advantages = tf.minimum(self.likelihood_ratio * self.advantages, - self.clipped_likelihood_ratio * self.advantages) - else: - self.scaled_advantages = self.likelihood_ratio * self.advantages - # minus sign is in order to set an objective to minimize (we actually strive for maximizing the surrogate loss) - self.surrogate_loss = -tf.reduce_mean(self.scaled_advantages) - if self.is_local: - # add entropy regularization - if self.beta: - self.entropy = tf.reduce_mean(self.policy_distribution.entropy()) - self.regularizations = -tf.multiply(self.beta, self.entropy, name='entropy_regularization') - tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, 
self.regularizations) - - self.loss = self.surrogate_loss - tf.losses.add_loss(self.loss) - - -class PPOVHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'ppo_v_head' - self.clip_likelihood_ratio_using_epsilon = tuning_parameters.agent.clip_likelihood_ratio_using_epsilon - - def _build_module(self, input_layer): - self.old_policy_value = tf.placeholder(tf.float32, [None], "old_policy_values") - self.input = [self.old_policy_value] - self.output = tf.layers.dense(input_layer, 1, name='output', - kernel_initializer=normalized_columns_initializer(1.0)) - self.target = self.total_return = tf.placeholder(tf.float32, [None], name="total_return") - - value_loss_1 = tf.square(self.output - self.target) - value_loss_2 = tf.square(self.old_policy_value + - tf.clip_by_value(self.output - self.old_policy_value, - -self.clip_likelihood_ratio_using_epsilon, - self.clip_likelihood_ratio_using_epsilon) - self.target) - self.vf_loss = tf.reduce_mean(tf.maximum(value_loss_1, value_loss_2)) - self.loss = self.vf_loss - tf.losses.add_loss(self.loss) - - -class CategoricalQHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'categorical_dqn_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.num_atoms = tuning_parameters.agent.atoms - - def _build_module(self, input_layer): - self.actions = tf.placeholder(tf.int32, [None], name="actions") - self.input = [self.actions] - - values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output') - values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions, self.num_atoms)) - # softmax on atoms dimension - self.output = tf.nn.softmax(values_distribution) - - # calculate cross entropy loss - self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms), name="distributions") - self.target = self.distributions - self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution) - tf.losses.add_loss(self.loss) - - -class QuantileRegressionQHead(Head): - def __init__(self, tuning_parameters, head_idx=0, loss_weight=1., is_local=True): - Head.__init__(self, tuning_parameters, head_idx, loss_weight, is_local) - self.name = 'quantile_regression_dqn_head' - self.num_actions = tuning_parameters.env_instance.action_space_size - self.num_atoms = tuning_parameters.agent.atoms # we use atom / quantile interchangeably - self.huber_loss_interval = 1 # k - - def _build_module(self, input_layer): - self.actions = tf.placeholder(tf.int32, [None, 2], name="actions") - self.quantile_midpoints = tf.placeholder(tf.float32, [None, self.num_atoms], name="quantile_midpoints") - self.input = [self.actions, self.quantile_midpoints] - - # the output of the head is the N unordered quantile locations {theta_1, ..., theta_N} - quantiles_locations = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output') - quantiles_locations = tf.reshape(quantiles_locations, (tf.shape(quantiles_locations)[0], self.num_actions, self.num_atoms)) - self.output = quantiles_locations - - self.quantiles = tf.placeholder(tf.float32, shape=(None, self.num_atoms), name="quantiles") - self.target = self.quantiles - - # only the quantiles of the taken action are taken 
into account - quantiles_for_used_actions = tf.gather_nd(quantiles_locations, self.actions) - - # reorder the output quantiles and the target quantiles as a preparation step for calculating the loss - # the output quantiles vector and the quantile midpoints are tiled as rows of a NxN matrix (N = num quantiles) - # the target quantiles vector is tiled as column of a NxN matrix - theta_i = tf.tile(tf.expand_dims(quantiles_for_used_actions, -1), [1, 1, self.num_atoms]) - T_theta_j = tf.tile(tf.expand_dims(self.target, -2), [1, self.num_atoms, 1]) - tau_i = tf.tile(tf.expand_dims(self.quantile_midpoints, -1), [1, 1, self.num_atoms]) - - # Huber loss of T(theta_j) - theta_i - error = T_theta_j - theta_i - abs_error = tf.abs(error) - quadratic = tf.minimum(abs_error, self.huber_loss_interval) - huber_loss = self.huber_loss_interval * (abs_error - quadratic) + 0.5 * quadratic ** 2 - - # Quantile Huber loss - quantile_huber_loss = tf.abs(tau_i - tf.cast(error < 0, dtype=tf.float32)) * huber_loss - - # Quantile regression loss (the probability for each quantile is 1/num_quantiles) - quantile_regression_loss = tf.reduce_sum(quantile_huber_loss) / float(self.num_atoms) - self.loss = quantile_regression_loss - tf.losses.add_loss(self.loss) diff --git a/architectures/tensorflow_components/middleware.py b/architectures/tensorflow_components/middleware.py deleted file mode 100644 index eee5925..0000000 --- a/architectures/tensorflow_components/middleware.py +++ /dev/null @@ -1,77 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import tensorflow as tf -import numpy as np -from configurations import EmbedderWidth - - -class MiddlewareEmbedder(object): - def __init__(self, activation_function=tf.nn.relu, embedder_width=EmbedderWidth.Wide, name="middleware_embedder"): - self.name = name - self.input = None - self.output = None - self.embedder_width = embedder_width - self.activation_function = activation_function - - def __call__(self, input_layer): - with tf.variable_scope(self.get_name()): - self.input = input_layer - self._build_module() - - return self.input, self.output - - def _build_module(self): - pass - - def get_name(self): - return self.name - - -class LSTM_Embedder(MiddlewareEmbedder): - def _build_module(self): - """ - self.state_in: tuple of placeholders containing the initial state - self.state_out: tuple of output state - - todo: it appears that the shape of the output is batch, feature - the code here seems to be slicing off the first element in the batch - which would definitely be wrong. 
need to double check the shape - """ - - middleware = tf.layers.dense(self.input, 512, activation=self.activation_function, name='fc1') - lstm_cell = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True) - self.c_init = np.zeros((1, lstm_cell.state_size.c), np.float32) - self.h_init = np.zeros((1, lstm_cell.state_size.h), np.float32) - self.state_init = [self.c_init, self.h_init] - self.c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c]) - self.h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h]) - self.state_in = (self.c_in, self.h_in) - rnn_in = tf.expand_dims(middleware, [0]) - step_size = tf.shape(middleware)[:1] - state_in = tf.contrib.rnn.LSTMStateTuple(self.c_in, self.h_in) - lstm_outputs, lstm_state = tf.nn.dynamic_rnn( - lstm_cell, rnn_in, initial_state=state_in, sequence_length=step_size, time_major=False) - lstm_c, lstm_h = lstm_state - self.state_out = (lstm_c[:1, :], lstm_h[:1, :]) - self.output = tf.reshape(lstm_outputs, [-1, 256]) - - -class FC_Embedder(MiddlewareEmbedder): - def _build_module(self): - width = 512 if self.embedder_width == EmbedderWidth.Wide else 64 - self.output = tf.layers.dense(self.input, width, activation=self.activation_function, name='fc1') - diff --git a/architectures/tensorflow_components/shared_variables.py b/architectures/tensorflow_components/shared_variables.py deleted file mode 100644 index 2775251..0000000 --- a/architectures/tensorflow_components/shared_variables.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import tensorflow as tf -import numpy as np - - -class SharedRunningStats(object): - def __init__(self, tuning_parameters, replicated_device, epsilon=1e-2, shape=(), name=""): - self.tp = tuning_parameters - with tf.device(replicated_device): - with tf.variable_scope(name): - self._sum = tf.get_variable( - dtype=tf.float64, - shape=shape, - initializer=tf.constant_initializer(0.0), - name="running_sum", trainable=False) - self._sum_squared = tf.get_variable( - dtype=tf.float64, - shape=shape, - initializer=tf.constant_initializer(epsilon), - name="running_sum_squared", trainable=False) - self._count = tf.get_variable( - dtype=tf.float64, - shape=(), - initializer=tf.constant_initializer(epsilon), - name="count", trainable=False) - - self._shape = shape - self._mean = self._sum / self._count - self._std = tf.sqrt(tf.maximum((self._sum_squared - self._count*tf.square(self._mean)) - / tf.maximum(self._count-1, 1), epsilon)) - - self.new_sum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') - self.new_sum_squared = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') - self.newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') - - self._inc_sum = tf.assign_add(self._sum, self.new_sum, use_locking=True) - self._inc_sum_squared = tf.assign_add(self._sum_squared, self.new_sum_squared, use_locking=True) - self._inc_count = tf.assign_add(self._count, self.newcount, use_locking=True) - - def push(self, x): - x = x.astype('float64') - self.tp.sess.run([self._inc_sum, self._inc_sum_squared, self._inc_count], - feed_dict={ - self.new_sum: x.sum(axis=0).ravel(), - self.new_sum_squared: np.square(x).sum(axis=0).ravel(), - self.newcount: np.array(len(x), dtype='float64') - }) - - @property - def n(self): - return self.tp.sess.run(self._count) - - @property - def mean(self): - return self.tp.sess.run(self._mean) - - @property - def var(self): - return self.std ** 2 - - @property - def std(self): - return self.tp.sess.run(self._std) - - @property - def shape(self): - return self._shape \ No newline at end of file diff --git a/benchmarks/README.md b/benchmarks/README.md index ba237e7..0c00442 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,172 +1,44 @@ # Coach Benchmarks -The following figures are training curves of some of the presets available through Coach. -The X axis in all the figures is the total steps (for multi-threaded runs, this is the accumulated number of steps over all the workers). -The Y axis in all the figures is the average episode reward with an averaging window of 11 episodes. +The following table represents the current status of algorithms implemented in Coach relative to the results reported in the original papers. The detailed results for each algorithm can be seen by clicking on its name. + +The X axis in all the figures is the total steps (for multi-threaded runs, this is the number of steps per worker). +The Y axis in all the figures is the average episode reward with an averaging window of 100 timesteps. + +For each algorithm, there is a command line for reproducing the results of each graph. These are the results you can expect to get when running the pre-defined presets in Coach. +The environments that were used for testing include: +* **Atari** - Breakout, Pong and Space Invaders +* **Mujoco** - Inverted Pendulum, Inverted Double Pendulum, Reacher, Hopper, Half Cheetah, Walker 2D, Ant, Swimmer and Humanoid. 
+* **Doom** - Basic, Health Gathering (D1: Basic), Health Gathering Supreme (D2: Navigation), Battle (D3: Battle) +* **Fetch** - Reach, Slide, Push, Pick-and-Place -## A3C +## Summary -### Breakout_A3C with 16 workers +![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) *Reproducing paper's results* -```bash -python3 coach.py -p Breakout_A3C -n 16 -r -``` +![#ceffad](https://placehold.it/15/ceffad/000000?text=+) *Reproducing paper's results for some of the environments* -Breakout_A3C_16_workers +![#FFA500](https://placehold.it/15/FFA500/000000?text=+) *Training but not reproducing paper's results* -### InvertedPendulum_A3C with 16 workers +![#FF4040](https://placehold.it/15/FF4040/000000?text=+) *Not training* -```bash -python3 coach.py -p InvertedPendulum_A3C -n 16 -r -``` -Inverted_Pendulum_A3C_16_workers +| |**Status** |**Environments**|**Comments**| +| ----------------------- |:--------------------------------------------------------:|:--------------:|:--------:| +|**[DQN](dqn)** | ![#ceffad](https://placehold.it/15/ceffad/000000?text=+) |Atari | Pong is not training | +|**[Dueling DDQN](dueling_ddqn)**| ![#ceffad](https://placehold.it/15/ceffad/000000?text=+) |Atari | Pong is not training | +|**[Dueling DDQN with PER](dueling_ddqn_with_per)**| ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Atari | | +|**[Bootstrapped DQN](bootstrapped_dqn)**| ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Atari | | +|**[QR-DQN](qr_dqn)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Atari | | +|**[A3C](a3c)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Atari, Mujoco | | +|**[Clipped PPO](clipped_ppo)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Mujoco | | +|**[DDPG](ddpg)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Mujoco | | +|**[NEC](nec)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Atari | | +|**[HER](ddpg_her)** | ![#2E8B57](https://placehold.it/15/2E8B57/000000?text=+) |Fetch | | +|**[HAC](hac)** | ![#969696](https://placehold.it/15/969696/000000?text=+) |Pendulum | | +|**[DFP](dfp)** | ![#ceffad](https://placehold.it/15/ceffad/000000?text=+) |Doom | Doom Battle was not verified | -### Hopper_A3C with 16 workers -```bash -python3 coach.py -p Hopper_A3C -n 16 -r -``` - -Hopper_A3C_16_workers - -### Ant_A3C with 16 workers - -```bash -python3 coach.py -p Ant_A3C -n 16 -r -``` - -Ant_A3C_16_workers - -## Clipped PPO - -### InvertedPendulum_ClippedPPO with 16 workers - -```bash -python3 coach.py -p InvertedPendulum_ClippedPPO -n 16 -r -``` - -InvertedPendulum_ClippedPPO_16_workers - -### Hopper_ClippedPPO with 16 workers - -```bash -python3 coach.py -p Hopper_ClippedPPO -n 16 -r -``` - -Hopper_Clipped_PPO_16_workers - -### Humanoid_ClippedPPO with 16 workers - -```bash -python3 coach.py -p Humanoid_ClippedPPO -n 16 -r -``` - -Humanoid_ClippedPPO_16_workers - -## DQN - -### Pong_DQN - -```bash -python3 coach.py -p Pong_DQN -r -``` - -Pong_DQN - -### Doom_Basic_DQN - -```bash -python3 coach.py -p Doom_Basic_DQN -r -``` - -Doom_Basic_DQN - -## Dueling DDQN - -### Doom_Basic_Dueling_DDQN - -```bash -python3 coach.py -p Doom_Basic_Dueling_DDQN -r -``` - -Doom_Basic_Dueling_DDQN - -## DFP - -### Doom_Health_DFP - -```bash -python3 coach.py -p Doom_Health_DFP -r -``` - -Doom_Health_DFP - -## MMC - -### Doom_Health_MMC - -```bash -python3 coach.py -p Doom_Health_MMC -r -``` - -Doom_Health_MMC - -## NEC - -## Pong_NEC - -```bash -python3 coach.py -p Pong_NEC -r -``` - -Pong_NEC - -## Doom_Basic_NEC - 
-```bash -python3 coach.py -p Doom_Basic_NEC -r -``` - -Doom_Basic_NEC - -## PG - -### CartPole_PG - -```bash -python3 coach.py -p CartPole_PG -r -``` - -CartPole_PG - -## DDPG - -### Pendulum_DDPG - -```bash -python3 coach.py -p Pendulum_DDPG -r -``` - -Pendulum_DDPG - - -## NAF - -### InvertedPendulum_NAF - -```bash -python3 coach.py -p InvertedPendulum_NAF -r -``` - -InvertedPendulum_NAF - -### Pendulum_NAF - -```bash -python3 coach.py -p Pendulum_NAF -r -``` - -Pendulum_NAF +**Click on each algorithm to see detailed benchmarking results** diff --git a/benchmarks/a3c/README.md b/benchmarks/a3c/README.md new file mode 100644 index 0000000..8fde621 --- /dev/null +++ b/benchmarks/a3c/README.md @@ -0,0 +1,43 @@ +# A3C + +Each experiment uses 3 seeds. +The parameters used for A3C are the same parameters as described in the [original paper](https://arxiv.org/abs/1602.01783). + +### Inverted Pendulum A3C - 1/2/4/8/16 workers + +```bash +python3 coach.py -p Mujoco_A3C -lvl inverted_pendulum -n 1 +python3 coach.py -p Mujoco_A3C -lvl inverted_pendulum -n 2 +python3 coach.py -p Mujoco_A3C -lvl inverted_pendulum -n 4 +python3 coach.py -p Mujoco_A3C -lvl inverted_pendulum -n 8 +python3 coach.py -p Mujoco_A3C -lvl inverted_pendulum -n 16 +``` + +Inverted Pendulum A3C + + +### Hopper A3C - 16 workers + +```bash +python3 coach.py -p Mujoco_A3C -lvl hopper -n 16 +``` + +Hopper A3C 16 workers + + +### Walker2D A3C - 16 workers + +```bash +python3 coach.py -p Mujoco_A3C -lvl walker2d -n 16 +``` + +Walker2D A3C 16 workers + + +### Space Invaders A3C - 16 workers + +```bash +python3 coach.py -p Atari_A3C -lvl space_invaders -n 16 +``` + +Space Invaders A3C 16 workers diff --git a/benchmarks/a3c/hopper_a3c_16_workers.png b/benchmarks/a3c/hopper_a3c_16_workers.png new file mode 100644 index 0000000..4607f6a Binary files /dev/null and b/benchmarks/a3c/hopper_a3c_16_workers.png differ diff --git a/benchmarks/a3c/inverted_pendulum_a3c.png b/benchmarks/a3c/inverted_pendulum_a3c.png new file mode 100644 index 0000000..65b1720 Binary files /dev/null and b/benchmarks/a3c/inverted_pendulum_a3c.png differ diff --git a/benchmarks/a3c/space_invaders_a3c_16_workers.png b/benchmarks/a3c/space_invaders_a3c_16_workers.png new file mode 100644 index 0000000..9208f89 Binary files /dev/null and b/benchmarks/a3c/space_invaders_a3c_16_workers.png differ diff --git a/benchmarks/a3c/walker2d_a3c_16_workers.png b/benchmarks/a3c/walker2d_a3c_16_workers.png new file mode 100644 index 0000000..a003359 Binary files /dev/null and b/benchmarks/a3c/walker2d_a3c_16_workers.png differ diff --git a/benchmarks/bootstrapped_dqn/README.md b/benchmarks/bootstrapped_dqn/README.md new file mode 100644 index 0000000..8a5f059 --- /dev/null +++ b/benchmarks/bootstrapped_dqn/README.md @@ -0,0 +1,31 @@ +# Bootstrapped DQN + +Each experiment uses 3 seeds. +The parameters used for Bootstrapped DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1602.04621).
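As a reminder of what these presets train: Bootstrapped DQN keeps K Q-value heads on a shared torso, samples one head at the start of every episode and acts greedily with respect to it, while each head is trained only on its own bootstrapped subset of the replay data. The sketch below illustrates just the per-episode head-selection logic; the `q_values_per_head` function and the constants are illustrative placeholders, not Coach's implementation.

```python
# Minimal sketch of Bootstrapped DQN's per-episode head selection (illustrative only).
import numpy as np

K = 10            # number of bootstrap heads (a typical value)
NUM_ACTIONS = 4

def q_values_per_head(state):
    # Placeholder for a forward pass through a K-headed Q-network (not Coach's API).
    rng = np.random.default_rng(abs(hash(state)) % (2 ** 32))
    return rng.standard_normal((K, NUM_ACTIONS))

def run_episode(states):
    head = np.random.randint(K)                 # sample one head for the whole episode
    actions = []
    for state in states:
        q = q_values_per_head(state)[head]      # act greedily w.r.t. the sampled head only
        actions.append(int(np.argmax(q)))
        # in the real algorithm, each stored transition also carries a per-head bootstrap
        # mask so that every head is trained on its own subset of the replay buffer
    return actions

print(run_episode(["s0", "s1", "s2"]))
```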
+ +### Breakout Bootstrapped DQN - single worker + +```bash +python3 coach.py -p Atari_Bootstrapped_DQN -lvl breakout +``` + +Breakout Bootstrapped DQN + + +### Pong Bootstrapped DQN - single worker + +```bash +python3 coach.py -p Atari_Bootstrapped_DQN -lvl pong +``` + +Pong Bootstrapped DQN + + +### Space Invaders Bootstrapped DQN - single worker + +```bash +python3 coach.py -p Atari_Bootstrapped_DQN -lvl space_invaders +``` + +Space Invaders Bootstrapped DQN + diff --git a/benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png b/benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png new file mode 100644 index 0000000..b38b6fc Binary files /dev/null and b/benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png differ diff --git a/benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png b/benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png new file mode 100644 index 0000000..af7ca76 Binary files /dev/null and b/benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png differ diff --git a/benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png b/benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png new file mode 100644 index 0000000..1494f40 Binary files /dev/null and b/benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png differ diff --git a/benchmarks/clipped_ppo/README.md b/benchmarks/clipped_ppo/README.md new file mode 100644 index 0000000..00f2766 --- /dev/null +++ b/benchmarks/clipped_ppo/README.md @@ -0,0 +1,84 @@ +# Clipped PPO + +Each experiment uses 3 seeds and is trained for 10k environment steps. +The parameters used for Clipped PPO are the same parameters as described in the [original paper](https://arxiv.org/abs/1707.06347). + +### Inverted Pendulum Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl inverted_pendulum +``` + +Inverted Pendulum Clipped PPO + + +### Inverted Double Pendulum Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl inverted_double_pendulum +``` + +Inverted Double Pendulum Clipped PPO + + +### Reacher Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl reacher +``` + +Reacher Clipped PPO + + +### Hopper Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl hopper +``` + +Hopper Clipped PPO + + +### Half Cheetah Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl half_cheetah +``` + +Half Cheetah Clipped PPO + + +### Walker 2D Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl walker2d +``` + +Walker 2D Clipped PPO + + +### Ant Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl ant +``` + +Ant Clipped PPO + + +### Swimmer Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl swimmer +``` + +Swimmer Clipped PPO + + +### Humanoid Clipped PPO - single worker + +```bash +python3 coach.py -p Mujoco_ClippedPPO -lvl humanoid +``` + +Humanoid Clipped PPO diff --git a/benchmarks/clipped_ppo/ant_clipped_ppo.png b/benchmarks/clipped_ppo/ant_clipped_ppo.png new file mode 100644 index 0000000..d500180 Binary files /dev/null and b/benchmarks/clipped_ppo/ant_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png b/benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png new file mode 100644 index 0000000..fc4c5b9 Binary files /dev/null and b/benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/hopper_clipped_ppo.png 
b/benchmarks/clipped_ppo/hopper_clipped_ppo.png new file mode 100644 index 0000000..79cc2bf Binary files /dev/null and b/benchmarks/clipped_ppo/hopper_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/humanoid_clipped_ppo.png b/benchmarks/clipped_ppo/humanoid_clipped_ppo.png new file mode 100644 index 0000000..1612430 Binary files /dev/null and b/benchmarks/clipped_ppo/humanoid_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png b/benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png new file mode 100644 index 0000000..6473460 Binary files /dev/null and b/benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png b/benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png new file mode 100644 index 0000000..0302d17 Binary files /dev/null and b/benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/reacher_clipped_ppo.png b/benchmarks/clipped_ppo/reacher_clipped_ppo.png new file mode 100644 index 0000000..d58e3e6 Binary files /dev/null and b/benchmarks/clipped_ppo/reacher_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/swimmer_clipped_ppo.png b/benchmarks/clipped_ppo/swimmer_clipped_ppo.png new file mode 100644 index 0000000..7fd0e8f Binary files /dev/null and b/benchmarks/clipped_ppo/swimmer_clipped_ppo.png differ diff --git a/benchmarks/clipped_ppo/walker2d_clipped_ppo.png b/benchmarks/clipped_ppo/walker2d_clipped_ppo.png new file mode 100644 index 0000000..3150b70 Binary files /dev/null and b/benchmarks/clipped_ppo/walker2d_clipped_ppo.png differ diff --git a/benchmarks/ddpg/README.md b/benchmarks/ddpg/README.md new file mode 100644 index 0000000..f10fa0e --- /dev/null +++ b/benchmarks/ddpg/README.md @@ -0,0 +1,84 @@ +# DDPG + +Each experiment uses 3 seeds and is trained for 2k environment steps. +The parameters used for DDPG are the same parameters as described in the [original paper](https://arxiv.org/abs/1509.02971). 
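For context on what this preset optimizes: DDPG trains a deterministic actor and a Q-value critic, and stabilizes learning with target copies of both networks that follow the online weights through a slow soft update, while the critic regresses onto a bootstrapped target. Below is a minimal NumPy sketch of those two update rules under assumed names (`tau`, `gamma`, plain weight lists); it illustrates the rules from the paper, not Coach's internals.

```python
# Illustrative NumPy sketch of DDPG's two core update rules (not Coach's implementation).
import numpy as np

tau, gamma = 0.001, 0.99   # soft-update rate and discount factor, as in the DDPG paper

def soft_update(target_weights, online_weights):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target, applied per weight tensor
    return [tau * w + (1.0 - tau) * w_t for w, w_t in zip(online_weights, target_weights)]

def critic_target(reward, done, q_next_from_targets):
    # y = r + gamma * Q_target(s', mu_target(s')) for non-terminal transitions
    return reward + gamma * (1.0 - done) * q_next_from_targets

online = [np.ones((4, 4)), np.ones(4)]
target = [np.zeros((4, 4)), np.zeros(4)]
target = soft_update(target, online)                                # target drifts slowly toward online
y = critic_target(reward=1.0, done=0.0, q_next_from_targets=0.5)    # -> 1.495
```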
+ +### Inverted Pendulum DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl inverted_pendulum +``` + +Inverted Pendulum DDPG + + +### Inverted Double Pendulum DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl inverted_double_pendulum +``` + +Inverted Double Pendulum DDPG + + +### Reacher DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl reacher +``` + +Reacher DDPG + + +### Hopper DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl hopper +``` + +Hopper DDPG + + +### Half Cheetah DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl half_cheetah +``` + +Half Cheetah DDPG + + +### Walker 2D DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl walker2d +``` + +Walker 2D DDPG + + +### Ant DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl ant +``` + +Ant DDPG + + +### Swimmer DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl swimmer +``` + +Swimmer DDPG + + +### Humanoid DDPG - single worker + +```bash +python3 coach.py -p Mujoco_DDPG -lvl humanoid +``` + +Humanoid DDPG diff --git a/benchmarks/ddpg/ant_ddpg.png b/benchmarks/ddpg/ant_ddpg.png new file mode 100644 index 0000000..61678c1 Binary files /dev/null and b/benchmarks/ddpg/ant_ddpg.png differ diff --git a/benchmarks/ddpg/half_cheetah_ddpg.png b/benchmarks/ddpg/half_cheetah_ddpg.png new file mode 100644 index 0000000..9b6689f Binary files /dev/null and b/benchmarks/ddpg/half_cheetah_ddpg.png differ diff --git a/benchmarks/ddpg/hopper_ddpg.png b/benchmarks/ddpg/hopper_ddpg.png new file mode 100644 index 0000000..18061be Binary files /dev/null and b/benchmarks/ddpg/hopper_ddpg.png differ diff --git a/benchmarks/ddpg/humanoid_ddpg.png b/benchmarks/ddpg/humanoid_ddpg.png new file mode 100644 index 0000000..ba73d2f Binary files /dev/null and b/benchmarks/ddpg/humanoid_ddpg.png differ diff --git a/benchmarks/ddpg/inverted_double_pendulum_ddpg.png b/benchmarks/ddpg/inverted_double_pendulum_ddpg.png new file mode 100644 index 0000000..519da9e Binary files /dev/null and b/benchmarks/ddpg/inverted_double_pendulum_ddpg.png differ diff --git a/benchmarks/ddpg/inverted_pendulum_ddpg.png b/benchmarks/ddpg/inverted_pendulum_ddpg.png new file mode 100644 index 0000000..bd064a8 Binary files /dev/null and b/benchmarks/ddpg/inverted_pendulum_ddpg.png differ diff --git a/benchmarks/ddpg/reacher_ddpg.png b/benchmarks/ddpg/reacher_ddpg.png new file mode 100644 index 0000000..114d9cd Binary files /dev/null and b/benchmarks/ddpg/reacher_ddpg.png differ diff --git a/benchmarks/ddpg/swimmer_ddpg.png b/benchmarks/ddpg/swimmer_ddpg.png new file mode 100644 index 0000000..3e04fd7 Binary files /dev/null and b/benchmarks/ddpg/swimmer_ddpg.png differ diff --git a/benchmarks/ddpg/walker2d_ddpg.png b/benchmarks/ddpg/walker2d_ddpg.png new file mode 100644 index 0000000..50efd3c Binary files /dev/null and b/benchmarks/ddpg/walker2d_ddpg.png differ diff --git a/benchmarks/ddpg_her/README.md b/benchmarks/ddpg_her/README.md new file mode 100644 index 0000000..6dfdc57 --- /dev/null +++ b/benchmarks/ddpg_her/README.md @@ -0,0 +1,40 @@ +# DDPG with Hindsight Experience Replay + +Each experiment uses 3 seeds. +The parameters used for DDPG HER are the same parameters as described in the [following paper](https://arxiv.org/abs/1802.09464). 
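The mechanism that makes these runs sample-efficient is hindsight relabeling: each episode is additionally stored as if a goal that was actually achieved had been the intended one, with the sparse reward recomputed against that substituted goal. The snippet below sketches the simple "final" relabeling strategy; the transition layout and the `sparse_reward` helper are assumptions for illustration, not Coach's data structures.

```python
# Minimal sketch of HER's "final" goal-relabeling strategy (illustrative only).
import numpy as np

def sparse_reward(achieved_goal, goal, eps=0.05):
    # 0 when the achieved goal is within eps of the desired goal, -1 otherwise (Fetch-style reward)
    return 0.0 if np.linalg.norm(achieved_goal - goal) < eps else -1.0

def relabel_with_final_goal(episode):
    # episode: list of (state, action, achieved_goal, desired_goal) tuples (an assumed layout)
    final_goal = episode[-1][2]                  # the goal actually achieved at the end of the episode
    relabeled = []
    for state, action, achieved_goal, _ in episode:
        relabeled.append((state, action, final_goal,
                          sparse_reward(achieved_goal, final_goal)))
    return relabeled

episode = [(np.zeros(3), 0, np.array([0.10, 0.0, 0.0]), np.ones(3)),
           (np.zeros(3), 1, np.array([0.12, 0.0, 0.0]), np.ones(3))]
extra_transitions = relabel_with_final_goal(episode)   # stored in the replay buffer alongside the originals
```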
+ +### Fetch Reach DDPG HER - single worker + +```bash +python3 coach.py -p Fetch_DDPG_HER_baselines -lvl reach +``` + +Fetch DDPG HER Reach 1 Worker + + +### Fetch Push DDPG HER - 8 workers + +```bash +python3 coach.py -p Fetch_DDPG_HER_baselines -lvl push -n 8 +``` + +Fetch DDPG HER Push 8 Worker + + +### Fetch Slide DDPG HER - 8 workers + +```bash +python3 coach.py -p Fetch_DDPG_HER_baselines -lvl slide -n 8 +``` + +Fetch DDPG HER Slide 8 Worker + + +### Fetch Pick And Place DDPG HER - 8 workers + +```bash +python3 coach.py -p Fetch_DDPG_HER -lvl pick_and_place -n 8 +``` + +Fetch DDPG HER Pick And Place 8 Workers + diff --git a/benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png b/benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png new file mode 100644 index 0000000..59b7138 Binary files /dev/null and b/benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png differ diff --git a/benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png b/benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png new file mode 100644 index 0000000..8c088ad Binary files /dev/null and b/benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png differ diff --git a/benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png b/benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png new file mode 100644 index 0000000..df0139c Binary files /dev/null and b/benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png differ diff --git a/benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png b/benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png new file mode 100644 index 0000000..d3d7623 Binary files /dev/null and b/benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png differ diff --git a/benchmarks/dfp/README.md b/benchmarks/dfp/README.md new file mode 100644 index 0000000..01ed6ae --- /dev/null +++ b/benchmarks/dfp/README.md @@ -0,0 +1,31 @@ +# DFP + +Each experiment uses 3 seeds. +The parameters used for DFP are the same parameters as described in the [original paper](https://arxiv.org/abs/1611.01779). + +### Doom Basic DFP - 8 workers + +```bash +python3 coach.py -p Doom_Basic_DFP -n 8 +``` + +Doom Basic DFP 8 workers + + +### Doom Health (D1: Basic) DFP - 8 workers + +```bash +python3 coach.py -p Doom_Health_DFP -n 8 +``` + +Doom Health DFP 8 workers + + + +### Doom Health Supreme (D2: Navigation) DFP - 8 workers + +```bash +python3 coach.py -p Doom_Health_Supreme_DFP -n 8 +``` + +Doom Health Supreme DFP 8 workers diff --git a/benchmarks/dfp/doom_basic_dfp_8_workers.png b/benchmarks/dfp/doom_basic_dfp_8_workers.png new file mode 100644 index 0000000..88369d5 Binary files /dev/null and b/benchmarks/dfp/doom_basic_dfp_8_workers.png differ diff --git a/benchmarks/dfp/doom_health_dfp_8_workers.png b/benchmarks/dfp/doom_health_dfp_8_workers.png new file mode 100644 index 0000000..bd448b3 Binary files /dev/null and b/benchmarks/dfp/doom_health_dfp_8_workers.png differ diff --git a/benchmarks/dfp/doom_health_supreme_dfp_8_workers.png b/benchmarks/dfp/doom_health_supreme_dfp_8_workers.png new file mode 100644 index 0000000..c22039a Binary files /dev/null and b/benchmarks/dfp/doom_health_supreme_dfp_8_workers.png differ diff --git a/benchmarks/dqn/README.md b/benchmarks/dqn/README.md new file mode 100644 index 0000000..97f1c5c --- /dev/null +++ b/benchmarks/dqn/README.md @@ -0,0 +1,14 @@ +# DQN + +Each experiment uses 3 seeds. +The parameters used for DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1607.05077.pdf). 
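+The Atari runs accept the same general switches as the rest of Coach. As a brief illustration
+(both flags are taken from coach.py's argument parser; the exact output location is the experiment
+directory that Coach creates for the run):
+
+```bash
+# Train Breakout while dumping evaluation GIFs and TensorBoard summaries
+# in addition to the regular CSV signal files.
+python3 coach.py -p Atari_DQN -lvl breakout -dg -tb
+```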
+ +### Breakout DQN - single worker + +```bash +python3 coach.py -p Atari_DQN -lvl breakout +``` + +Breakout DQN + + diff --git a/benchmarks/dqn/breakout_dqn.png b/benchmarks/dqn/breakout_dqn.png new file mode 100644 index 0000000..06fce22 Binary files /dev/null and b/benchmarks/dqn/breakout_dqn.png differ diff --git a/benchmarks/dueling_ddqn/README.md b/benchmarks/dueling_ddqn/README.md new file mode 100644 index 0000000..449e5af --- /dev/null +++ b/benchmarks/dueling_ddqn/README.md @@ -0,0 +1,14 @@ +# Dueling DDQN + +Each experiment uses 3 seeds and is trained for 10k environment steps. +The parameters used for Dueling DDQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1706.01502). + +### Breakout Dueling DDQN - single worker + +```bash +python3 coach.py -p Atari_Dueling_DDQN -lvl breakout +``` + +Breakout Dueling DDQN + + diff --git a/benchmarks/dueling_ddqn/breakout_dueling_ddqn.png b/benchmarks/dueling_ddqn/breakout_dueling_ddqn.png new file mode 100644 index 0000000..10fdd69 Binary files /dev/null and b/benchmarks/dueling_ddqn/breakout_dueling_ddqn.png differ diff --git a/benchmarks/dueling_ddqn_with_per/README.md b/benchmarks/dueling_ddqn_with_per/README.md new file mode 100644 index 0000000..6cc83be --- /dev/null +++ b/benchmarks/dueling_ddqn_with_per/README.md @@ -0,0 +1,31 @@ +# Dueling DDQN with Prioritized Experience Replay + +Each experiment uses 3 seeds and is trained for 10k environment steps. +The parameters used for Dueling DDQN with PER are the same parameters as described in the [following paper](https://arxiv.org/abs/1511.05952). + +### Breakout Dueling DDQN with PER - single worker + +```bash +python3 coach.py -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl breakout +``` + +Breakout Dueling DDQN with PER + + +### Pong Dueling DDQN with PER - single worker + +```bash +python3 coach.py -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl pong +``` + +Pong Dueling DDQN with PER + + +### Space Invaders Dueling DDQN with PER - single worker + +```bash +python3 coach.py -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl space_invaders +``` + +Space Invaders Dueling DDQN with PER + diff --git a/benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png b/benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png new file mode 100644 index 0000000..b7df622 Binary files /dev/null and b/benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png differ diff --git a/benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png b/benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png new file mode 100644 index 0000000..4f9ae2f Binary files /dev/null and b/benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png differ diff --git a/benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png b/benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png new file mode 100644 index 0000000..8d577e1 Binary files /dev/null and b/benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png differ diff --git a/benchmarks/img/Ant_A3C_16_workers.png b/benchmarks/img/Ant_A3C_16_workers.png deleted file mode 100644 index d677ab0..0000000 Binary files a/benchmarks/img/Ant_A3C_16_workers.png and /dev/null differ diff --git a/benchmarks/img/Breakout_A3C_16_workers.png b/benchmarks/img/Breakout_A3C_16_workers.png deleted file mode 100644 index 0f778e2..0000000 Binary files a/benchmarks/img/Breakout_A3C_16_workers.png and /dev/null differ diff --git a/benchmarks/img/CartPole_PG.png 
b/benchmarks/img/CartPole_PG.png deleted file mode 100644 index 46779dc..0000000 Binary files a/benchmarks/img/CartPole_PG.png and /dev/null differ diff --git a/benchmarks/img/Doom_Basic_DQN.png b/benchmarks/img/Doom_Basic_DQN.png deleted file mode 100644 index 5f9382f..0000000 Binary files a/benchmarks/img/Doom_Basic_DQN.png and /dev/null differ diff --git a/benchmarks/img/Doom_Basic_Dueling_DDQN.png b/benchmarks/img/Doom_Basic_Dueling_DDQN.png deleted file mode 100644 index 34478f7..0000000 Binary files a/benchmarks/img/Doom_Basic_Dueling_DDQN.png and /dev/null differ diff --git a/benchmarks/img/Doom_Basic_NEC.png b/benchmarks/img/Doom_Basic_NEC.png deleted file mode 100644 index 79b5c6f..0000000 Binary files a/benchmarks/img/Doom_Basic_NEC.png and /dev/null differ diff --git a/benchmarks/img/Doom_Health_DFP.png b/benchmarks/img/Doom_Health_DFP.png deleted file mode 100644 index 3f8e16c..0000000 Binary files a/benchmarks/img/Doom_Health_DFP.png and /dev/null differ diff --git a/benchmarks/img/Doom_Health_MMC.png b/benchmarks/img/Doom_Health_MMC.png deleted file mode 100644 index d43f66b..0000000 Binary files a/benchmarks/img/Doom_Health_MMC.png and /dev/null differ diff --git a/benchmarks/img/Hopper_A3C_16_workers.png b/benchmarks/img/Hopper_A3C_16_workers.png deleted file mode 100644 index 2c2efa7..0000000 Binary files a/benchmarks/img/Hopper_A3C_16_workers.png and /dev/null differ diff --git a/benchmarks/img/Hopper_ClippedPPO_16_workers.png b/benchmarks/img/Hopper_ClippedPPO_16_workers.png deleted file mode 100644 index e9821d9..0000000 Binary files a/benchmarks/img/Hopper_ClippedPPO_16_workers.png and /dev/null differ diff --git a/benchmarks/img/Humanoid_ClippedPPO_16_workers.png b/benchmarks/img/Humanoid_ClippedPPO_16_workers.png deleted file mode 100644 index 0488c98..0000000 Binary files a/benchmarks/img/Humanoid_ClippedPPO_16_workers.png and /dev/null differ diff --git a/benchmarks/img/InvertedPendulum_ClippedPPO_16_workers.png b/benchmarks/img/InvertedPendulum_ClippedPPO_16_workers.png deleted file mode 100644 index b563024..0000000 Binary files a/benchmarks/img/InvertedPendulum_ClippedPPO_16_workers.png and /dev/null differ diff --git a/benchmarks/img/InvertedPendulum_NAF.png b/benchmarks/img/InvertedPendulum_NAF.png deleted file mode 100644 index 9b8b6f6..0000000 Binary files a/benchmarks/img/InvertedPendulum_NAF.png and /dev/null differ diff --git a/benchmarks/img/Inverted_Pendulum_A3C_16_workers.png b/benchmarks/img/Inverted_Pendulum_A3C_16_workers.png deleted file mode 100644 index d459990..0000000 Binary files a/benchmarks/img/Inverted_Pendulum_A3C_16_workers.png and /dev/null differ diff --git a/benchmarks/img/Pendulum_DDPG.png b/benchmarks/img/Pendulum_DDPG.png deleted file mode 100644 index 89abbac..0000000 Binary files a/benchmarks/img/Pendulum_DDPG.png and /dev/null differ diff --git a/benchmarks/img/Pendulum_NAF.png b/benchmarks/img/Pendulum_NAF.png deleted file mode 100644 index 0faca93..0000000 Binary files a/benchmarks/img/Pendulum_NAF.png and /dev/null differ diff --git a/benchmarks/img/Pong_DQN.png b/benchmarks/img/Pong_DQN.png deleted file mode 100644 index 6122c78..0000000 Binary files a/benchmarks/img/Pong_DQN.png and /dev/null differ diff --git a/benchmarks/img/Pong_NEC.png b/benchmarks/img/Pong_NEC.png deleted file mode 100644 index 4148669..0000000 Binary files a/benchmarks/img/Pong_NEC.png and /dev/null differ diff --git a/benchmarks/qr_dqn/README.md b/benchmarks/qr_dqn/README.md new file mode 100644 index 0000000..e5f558c --- /dev/null +++ 
b/benchmarks/qr_dqn/README.md @@ -0,0 +1,21 @@ +# Quantile Regression DQN + +Each experiment uses 3 seeds and is trained for 10k environment steps. +The parameters used for QR-DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1710.10044.pdf). + +### Breakout QR-DQN - single worker + +```bash +python3 coach.py -p Atari_QR_DQN -lvl breakout +``` + +Breakout QR-DQN + + +### Pong QR-DQN - single worker + +```bash +python3 coach.py -p Atari_QR_DQN -lvl pong +``` + +Pong QR-DQN diff --git a/benchmarks/qr_dqn/breakout_qr_dqn.png b/benchmarks/qr_dqn/breakout_qr_dqn.png new file mode 100644 index 0000000..09b1c1c Binary files /dev/null and b/benchmarks/qr_dqn/breakout_qr_dqn.png differ diff --git a/benchmarks/qr_dqn/pong_qr_dqn.png b/benchmarks/qr_dqn/pong_qr_dqn.png new file mode 100644 index 0000000..8a39cfe Binary files /dev/null and b/benchmarks/qr_dqn/pong_qr_dqn.png differ diff --git a/coach.py b/coach.py deleted file mode 100644 index 8ba8cf3..0000000 --- a/coach.py +++ /dev/null @@ -1,333 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import sys, inspect, re -import os -import json -import presets -from presets import * -from utils import set_gpu, list_all_classes_in_module -from architectures import * -from environments import * -from agents import * -from utils import * -from logger import screen, logger -import argparse -from subprocess import Popen -import datetime -import presets -import atexit -import sys -import subprocess -from threading import Thread - -if len(set(failed_imports)) > 0: - screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports)))) - - -def set_framework(framework_type): - # choosing neural network framework - framework = Frameworks().get(framework_type) - sess = None - if framework == Frameworks.TensorFlow: - import tensorflow as tf - config = tf.ConfigProto() - config.allow_soft_placement = True - config.gpu_options.allow_growth = True - config.gpu_options.per_process_gpu_memory_fraction = 0.2 - sess = tf.Session(config=config) - elif framework == Frameworks.Neon: - import ngraph as ng - sess = ng.transformers.make_transformer() - screen.log_title("Using {} framework".format(Frameworks().to_string(framework))) - return sess - - -def check_input_and_fill_run_dict(parser): - args = parser.parse_args() - - # if no arg is given - if len(sys.argv) == 1: - parser.print_help() - exit(0) - - # list available presets - if args.list: - presets_lists = list_all_classes_in_module(presets) - screen.log_title("Available Presets:") - for preset in presets_lists: - print(preset) - sys.exit(0) - - # check inputs - try: - # num_workers = int(args.num_workers) - num_workers = int(re.match("^\d+$", args.num_workers).group(0)) - except ValueError: - screen.error("Parameter num_workers should be an integer.") - - preset_names = list_all_classes_in_module(presets) - if args.preset is not None and args.preset not in preset_names: - screen.error("A 
non-existing preset was selected. ") - - if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir): - screen.error("The requested checkpoint folder to load from does not exist. ") - - if args.save_model_sec is not None: - try: - args.save_model_sec = int(args.save_model_sec) - except ValueError: - screen.error("Parameter save_model_sec should be an integer.") - - if args.preset is None and (args.agent_type is None or args.environment_type is None - or args.exploration_policy_type is None) and not args.play: - screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,' - ' environment_type and exploration_policy_type to assemble a preset. ' - '\nAt least one of these parameters was not given.') - elif args.preset is None and args.play and args.environment_type is None: - screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,' - ' the user is expected to input the desired environment_type and level.' - '\nAt least one of these parameters was not given.') - elif args.preset is None and args.play and args.environment_type: - args.agent_type = 'Human' - args.exploration_policy_type = 'ExplorationParameters' - - # get experiment name and path - experiment_name = logger.get_experiment_name(args.experiment_name) - experiment_path = logger.get_experiment_path(experiment_name) - - if args.play and num_workers > 1: - screen.warning("Playing the game as a human is only available with a single worker. " - "The number of workers will be reduced to 1") - num_workers = 1 - - # fill run_dict - run_dict = dict() - run_dict['agent_type'] = args.agent_type - run_dict['environment_type'] = args.environment_type - run_dict['exploration_policy_type'] = args.exploration_policy_type - run_dict['level'] = args.level - run_dict['preset'] = args.preset - run_dict['custom_parameter'] = args.custom_parameter - run_dict['experiment_path'] = experiment_path - run_dict['framework'] = Frameworks().get(args.framework) - run_dict['play'] = args.play - run_dict['evaluate'] = args.evaluate# or args.play - - # multi-threading parameters - run_dict['num_threads'] = num_workers - - # checkpoints - run_dict['save_model_sec'] = args.save_model_sec - run_dict['save_model_dir'] = experiment_path if args.save_model_sec is not None else None - run_dict['checkpoint_restore_dir'] = args.checkpoint_restore_dir - - # visualization - run_dict['visualization.dump_gifs'] = args.dump_gifs - run_dict['visualization.render'] = args.render - run_dict['visualization.tensorboard'] = args.tensorboard - - return args, run_dict - - -def run_dict_to_json(_run_dict, task_id=''): - if task_id != '': - json_path = os.path.join(_run_dict['experiment_path'], 'run_dict_worker{}.json'.format(task_id)) - else: - json_path = os.path.join(_run_dict['experiment_path'], 'run_dict.json') - - with open(json_path, 'w') as outfile: - json.dump(_run_dict, outfile, indent=2) - - return json_path - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-p', '--preset', - help="(string) Name of a preset to run (as configured in presets.py)", - default=None, - type=str) - parser.add_argument('-l', '--list', - help="(flag) List all available presets", - action='store_true') - parser.add_argument('-e', '--experiment_name', - help="(string) Experiment name to be used to store the results.", - default='', - type=str) - parser.add_argument('-r', '--render', - help="(flag) Render environment", - 
action='store_true') - parser.add_argument('-f', '--framework', - help="(string) Neural network framework. Available values: tensorflow, neon", - default='tensorflow', - type=str) - parser.add_argument('-n', '--num_workers', - help="(int) Number of workers for multi-process based agents, e.g. A3C", - default='1', - type=str) - parser.add_argument('--play', - help="(flag) Play as a human by controlling the game with the keyboard. " - "This option will save a replay buffer with the game play.", - action='store_true') - parser.add_argument('--evaluate', - help="(flag) Run evaluation only. This is a convenient way to disable " - "training in order to evaluate an existing checkpoint.", - action='store_true') - parser.add_argument('-v', '--verbose', - help="(flag) Don't suppress TensorFlow debug prints.", - action='store_true') - parser.add_argument('-s', '--save_model_sec', - help="(int) Time in seconds between saving checkpoints of the model.", - default=None, - type=int) - parser.add_argument('-crd', '--checkpoint_restore_dir', - help='(string) Path to a folder containing a checkpoint to restore the model from.', - type=str) - parser.add_argument('-dg', '--dump_gifs', - help="(flag) Enable the gif saving functionality.", - action='store_true') - parser.add_argument('-at', '--agent_type', - help="(string) Choose an agent type class to override on top of the selected preset. " - "If no preset is defined, a preset can be set from the command-line by combining settings " - "which are set by using --agent_type, --experiment_type, --environemnt_type", - default=None, - type=str) - parser.add_argument('-et', '--environment_type', - help="(string) Choose an environment type class to override on top of the selected preset." - "If no preset is defined, a preset can be set from the command-line by combining settings " - "which are set by using --agent_type, --experiment_type, --environemnt_type", - default=None, - type=str) - parser.add_argument('-ept', '--exploration_policy_type', - help="(string) Choose an exploration policy type class to override on top of the selected " - "preset." - "If no preset is defined, a preset can be set from the command-line by combining settings " - "which are set by using --agent_type, --experiment_type, --environemnt_type" - , - default=None, - type=str) - parser.add_argument('-lvl', '--level', - help="(string) Choose the level that will be played in the environment that was selected." - "This value will override the level parameter in the environment class." - , - default=None, - type=str) - parser.add_argument('-cp', '--custom_parameter', - help="(string) Semicolon separated parameters used to override specific parameters on top of" - " the selected preset (or on top of the command-line assembled one). " - "Whenever a parameter value is a string, it should be inputted as '\\\"string\\\"'. " - "For ex.: " - "\"visualization.render=False; num_training_iterations=500; optimizer='rmsprop'\"", - default=None, - type=str) - parser.add_argument('--print_parameters', - help="(flag) Print tuning_parameters to stdout", - action='store_true') - parser.add_argument('-tb', '--tensorboard', - help="(flag) When using the TensorFlow backend, enable TensorBoard log dumps. 
", - action='store_true') - parser.add_argument('-ns', '--no_summary', - help="(flag) Prevent Coach from printing a summary and asking questions at the end of runs", - action='store_true') - - args, run_dict = check_input_and_fill_run_dict(parser) - - # turn TF debug prints off - if not args.verbose and args.framework.lower() == 'tensorflow': - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - # dump documentation - logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) - if not args.no_summary: - atexit.register(logger.summarize_experiment) - screen.change_terminal_title(logger.experiment_name) - - # Single-threaded runs - if run_dict['num_threads'] == 1: - # set tuning parameters - json_run_dict_path = run_dict_to_json(run_dict) - tuning_parameters = json_to_preset(json_run_dict_path) - tuning_parameters.sess = set_framework(args.framework) - - if args.print_parameters: - print('tuning_parameters', tuning_parameters) - - # Single-thread runs - tuning_parameters.task_index = 0 - env_instance = create_environment(tuning_parameters) - agent = eval(tuning_parameters.agent.type + '(env_instance, tuning_parameters)') - - # Start the training or evaluation - if tuning_parameters.evaluate: - agent.evaluate(sys.maxsize, keep_networks_synced=True) # evaluate forever - else: - agent.improve() - - # Multi-threaded runs - else: - assert args.framework.lower() == 'tensorflow', "Distributed training works only with TensorFlow" - os.environ["OMP_NUM_THREADS"]="1" - # set parameter server and workers addresses - ps_hosts = "localhost:{}".format(get_open_port()) - worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(run_dict['num_threads'] + 1)]) - - # Make sure to disable GPU so that all the workers will use the CPU - set_cpu() - - # create a parameter server - cmd = [ - "python3", - "./parallel_actor.py", - "--ps_hosts={}".format(ps_hosts), - "--worker_hosts={}".format(worker_hosts), - "--job_name=ps", - ] - parameter_server = Popen(cmd) - - screen.log_title("*** Distributed Training ***") - time.sleep(1) - - # create N training workers and 1 evaluating worker - workers = [] - - for i in range(run_dict['num_threads'] + 1): - # this is the evaluation worker - run_dict['task_id'] = i - if i == run_dict['num_threads']: - run_dict['evaluate_only'] = True - run_dict['visualization.render'] = args.render - else: - run_dict['evaluate_only'] = False - run_dict['visualization.render'] = False # #In a parallel setting, only the evaluation agent renders - - json_run_dict_path = run_dict_to_json(run_dict, i) - workers_args = ["python3", "./parallel_actor.py", - "--ps_hosts={}".format(ps_hosts), - "--worker_hosts={}".format(worker_hosts), - "--job_name=worker", - "--load_json={}".format(json_run_dict_path)] - - p = Popen(workers_args) - - if i != run_dict['num_threads']: - workers.append(p) - else: - evaluation_worker = p - - # wait for all workers - [w.wait() for w in workers] - evaluation_worker.kill() diff --git a/configurations.py b/configurations.py deleted file mode 100644 index a235c6c..0000000 --- a/configurations.py +++ /dev/null @@ -1,628 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from utils import Enum -import json -import types - - -class Frameworks(Enum): - TensorFlow = 1 - Neon = 2 - - -class InputTypes(object): - Observation = 1 - Measurements = 2 - GoalVector = 3 - Action = 4 - TimedObservation = 5 - - -class OutputTypes(object): - Q = 1 - DuelingQ = 2 - V = 3 - Pi = 4 - MeasurementsPrediction = 5 - DNDQ = 6 - NAF = 7 - PPO = 8 - PPO_V = 9 - CategoricalQ = 10 - QuantileRegressionQ = 11 - - - -class EmbedderDepth(object): - Shallow = 1 - Deep = 2 - - -class EmbedderWidth(object): - Narrow = 1 - Wide = 2 - - -class MiddlewareTypes(object): - LSTM = 1 - FC = 2 - - -class Parameters(object): - def __str__(self): - parameters = {} - for k, v in self.__dict__.items(): - if isinstance(v, type) and issubclass(v, Parameters): - # v.__dict__ doesn't return a dictionary but a mappingproxy - # which json doesn't serialize, so convert it into a normal - # dictionary - parameters[k] = dict(v.__dict__.items()) - elif isinstance(v, types.MappingProxyType): - parameters[k] = dict(v.items()) - else: - parameters[k] = v - - return json.dumps(parameters, indent=4, default=repr) - - -class AgentParameters(Parameters): - agent = '' - - # Architecture parameters - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - middleware_type = MiddlewareTypes.FC - loss_weights = [1.0] - stop_gradients_from_head = [False] - embedder_depth = EmbedderDepth.Shallow - embedder_width = EmbedderWidth.Wide - num_output_head_copies = 1 - use_measurements = False - use_accumulated_reward_as_measurement = False - add_a_normalized_timestep_to_the_observation = False - l2_regularization = 0 - hidden_layers_activation_function = 'relu' - optimizer_type = 'Adam' - async_training = False - use_separate_networks_per_head = False - - # Agent parameters - num_consecutive_playing_steps = 1 - num_consecutive_training_steps = 1 - update_evaluation_agent_network_after_every_num_steps = 3000 - bootstrap_total_return_from_old_policy = False - n_step = -1 - num_episodes_in_experience_replay = 200 - num_transitions_in_experience_replay = None - discount = 0.99 - policy_gradient_rescaler = 'A_VALUE' - apply_gradients_every_x_episodes = 5 - beta_entropy = 0 - num_steps_between_gradient_updates = 20000 # t_max - num_steps_between_copying_online_weights_to_target = 1000 - rate_for_copying_weights_to_target = 1.0 - monte_carlo_mixing_rate = 0.1 - gae_lambda = 0.96 - step_until_collecting_full_episodes = False - targets_horizon = 'N-Step' - replace_mse_with_huber_loss = False - load_memory_from_file_path = None - collect_new_data = True - input_rescaler = 255.0 - - # PPO related params - target_kl_divergence = 0.01 - initial_kl_coefficient = 1.0 - high_kl_penalty_coefficient = 1000 - value_targets_mix_fraction = 0.1 - clip_likelihood_ratio_using_epsilon = None - use_kl_regularization = True - estimate_value_using_gae = False - - # DFP related params - num_predicted_steps_ahead = 6 - goal_vector = [1.0, 1.0] - future_measurements_weights = [0.5, 0.5, 1.0] - - # NEC related params - dnd_size = 500000 - l2_norm_added_delta = 0.001 - new_value_shift_coefficient = 0.1 - 
number_of_knn = 50 - DND_key_error_threshold = 0.01 - - # Framework support - neon_support = False - tensorflow_support = True - - # distributed agents params - shared_optimizer = True - share_statistics_between_workers = True - - -class EnvironmentParameters(Parameters): - type = 'Doom' - level = 'basic' - observation_stack_size = 4 - frame_skip = 4 - desired_observation_width = 76 - desired_observation_height = 60 - normalize_observation = False - crop_observation = False - random_initialization_steps = 0 - reward_scaling = 1.0 - reward_clipping_min = None - reward_clipping_max = None - human_control = False - - -class ExplorationParameters(Parameters): - # Exploration policies - policy = 'EGreedy' - evaluation_policy = 'Greedy' - # -- bootstrap dqn parameters - bootstrapped_data_sharing_probability = 0.5 - architecture_num_q_heads = 1 - # -- dropout approximation of thompson sampling parameters - dropout_discard_probability = 0 - initial_keep_probability = 0.0 # unused - final_keep_probability = 0.99 # unused - keep_probability_decay_steps = 50000 # unused - # -- epsilon greedy parameters - initial_epsilon = 0.5 - final_epsilon = 0.01 - epsilon_decay_steps = 50000 - evaluation_epsilon = 0.05 - # -- epsilon greedy at end of episode parameters - average_episode_length_over_num_episodes = 20 - # -- boltzmann softmax parameters - initial_temperature = 100.0 - final_temperature = 1.0 - temperature_decay_steps = 50000 - # -- additive noise - initial_noise_variance_percentage = 0.1 - final_noise_variance_percentage = 0.1 - noise_variance_decay_steps = 1 - # -- Ornstein-Uhlenbeck process - mu = 0 - theta = 0.15 - sigma = 0.3 - dt = 0.01 - - -class GeneralParameters(Parameters): - train = True - framework = Frameworks.TensorFlow - threads = 1 - sess = None - - # distributed training options - num_threads = 1 - synchronize_over_num_threads = 1 - distributed = False - - # Agent blocks - memory = 'EpisodicExperienceReplay' - architecture = 'GeneralTensorFlowNetwork' - - # General parameters - clip_gradients = None - kl_divergence_constraint = 100000 - num_training_iterations = 10000000000 - num_heatup_steps = 1000 - heatup_using_network_decisions = False - batch_size = 32 - save_model_sec = None - save_model_dir = None - checkpoint_restore_dir = None - learning_rate = 0.00025 - learning_rate_decay_rate = 0 - learning_rate_decay_steps = 0 - evaluation_episodes = 5 - evaluate_every_x_episodes = 1000000 - evaluate_every_x_training_iterations = 0 - rescaling_interpolation_type = 'bilinear' - current_episode = 0 - - # setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in - # the form of different workers starting at different times, and getting different assignments of CPU - # time from the OS. 
- seed = None - - checkpoints_path = '' - - # Testing parameters - test = False - test_min_return_threshold = 0 - test_max_step_threshold = 1 - test_num_workers = 1 - - -class VisualizationParameters(Parameters): - # Visualization parameters - record_video_every = 1000 - video_path = '/home/llt_lab/temp/breakout-videos' - plot_action_values_online = False - show_saliency_maps_every_num_episodes = 1000000000 - render_observation = False - print_summary = False - dump_csv = True - dump_signals_to_csv_every_x_episodes = 5 - render = False - dump_gifs = True - max_fps_for_human_control = 10 - tensorboard = False - - -class Roboschool(EnvironmentParameters): - type = 'Gym' - frame_skip = 1 - observation_stack_size = 1 - desired_observation_height = None - desired_observation_width = None - - -class GymVectorObservation(EnvironmentParameters): - type = 'Gym' - frame_skip = 1 - observation_stack_size = 1 - desired_observation_height = None - desired_observation_width = None - - -class Bullet(EnvironmentParameters): - type = 'Bullet' - frame_skip = 1 - observation_stack_size = 1 - desired_observation_height = None - desired_observation_width = None - - -class Atari(EnvironmentParameters): - type = 'Gym' - frame_skip = 4 - observation_stack_size = 4 - desired_observation_height = 84 - desired_observation_width = 84 - reward_clipping_max = 1.0 - reward_clipping_min = -1.0 - random_initialization_steps = 30 - crop_observation = False # in the original paper the observation is cropped but not in the Nature paper - - -class Doom(EnvironmentParameters): - type = 'Doom' - frame_skip = 4 - observation_stack_size = 3 - desired_observation_height = 60 - desired_observation_width = 76 - - -class Carla(EnvironmentParameters): - type = 'Carla' - frame_skip = 1 - observation_stack_size = 4 - desired_observation_height = 128 - desired_observation_width = 180 - normalize_observation = False - server_height = 256 - server_width = 360 - config = 'environments/CarlaSettings.ini' - level = 'town1' - verbose = True - stereo = False - semantic_segmentation = False - depth = False - episode_max_time = 100000 # miliseconds for each episode - continuous_to_bool_threshold = 0.5 - allow_braking = False - - -class Human(AgentParameters): - type = 'HumanAgent' - num_episodes_in_experience_replay = 10000000 - - -class NStepQ(AgentParameters): - type = 'NStepQAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - loss_weights = [1.0] - optimizer_type = 'Adam' - num_steps_between_copying_online_weights_to_target = 1000 - num_episodes_in_experience_replay = 2 - apply_gradients_every_x_episodes = 1 - num_steps_between_gradient_updates = 20 # this is called t_max in all the papers - hidden_layers_activation_function = 'elu' - targets_horizon = 'N-Step' - async_training = True - shared_optimizer = True - - -class DQN(AgentParameters): - type = 'DQNAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - loss_weights = [1.0] - optimizer_type = 'Adam' - num_steps_between_copying_online_weights_to_target = 1000 - neon_support = True - async_training = True - shared_optimizer = True - - -class DDQN(DQN): - type = 'DDQNAgent' - num_steps_between_copying_online_weights_to_target = 30000 - - -class DuelingDQN(DQN): - type = 'DQNAgent' - output_types = [OutputTypes.DuelingQ] - - -class BootstrappedDQN(DQN): - type = 'BootstrappedDQNAgent' - num_output_head_copies = 10 - - -class CategoricalDQN(DQN): - type = 'CategoricalDQNAgent' - output_types = 
[OutputTypes.CategoricalQ] - v_min = -10.0 - v_max = 10.0 - atoms = 51 - neon_support = False - - -class QuantileRegressionDQN(DQN): - type = 'QuantileRegressionDQNAgent' - output_types = [OutputTypes.QuantileRegressionQ] - atoms = 51 - - -class NEC(AgentParameters): - type = 'NECAgent' - optimizer_type = 'Adam' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.DNDQ] - loss_weights = [1.0] - dnd_size = 500000 - l2_norm_added_delta = 0.001 - new_value_shift_coefficient = 0.1 # alpha - number_of_knn = 50 - n_step = 100 - bootstrap_total_return_from_old_policy = True - DND_key_error_threshold = 0 - input_rescaler = 1.0 - num_consecutive_playing_steps = 4 - - -class ActorCritic(AgentParameters): - type = 'ActorCriticAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.V, OutputTypes.Pi] - loss_weights = [0.5, 1.0] - stop_gradients_from_head = [False, False] - num_episodes_in_experience_replay = 2 - policy_gradient_rescaler = 'A_VALUE' - hidden_layers_activation_function = 'elu' - apply_gradients_every_x_episodes = 5 - beta_entropy = 0 - num_steps_between_gradient_updates = 5000 # this is called t_max in all the papers - gae_lambda = 0.96 - shared_optimizer = True - estimate_value_using_gae = False - async_training = True - - -class PolicyGradient(AgentParameters): - type = 'PolicyGradientsAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Pi] - loss_weights = [1.0] - num_episodes_in_experience_replay = 2 - policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP' - apply_gradients_every_x_episodes = 5 - beta_entropy = 0 - num_steps_between_gradient_updates = 20000 # this is called t_max in all the papers - async_training = True - - -class DDPG(AgentParameters): - type = 'DDPGAgent' - input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action} - output_types = [OutputTypes.V] # V is used because we only want a single Q value - loss_weights = [1.0] - hidden_layers_activation_function = 'relu' - num_episodes_in_experience_replay = 10000 - num_steps_between_copying_online_weights_to_target = 1 - rate_for_copying_weights_to_target = 0.001 - shared_optimizer = True - async_training = True - - -class DDDPG(AgentParameters): - type = 'DDPGAgent' - input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action} - output_types = [OutputTypes.V] # V is used because we only want a single Q value - loss_weights = [1.0] - hidden_layers_activation_function = 'relu' - num_episodes_in_experience_replay = 10000 - num_steps_between_copying_online_weights_to_target = 10 - rate_for_copying_weights_to_target = 1 - shared_optimizer = True - async_training = True - - -class NAF(AgentParameters): - type = 'NAFAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.NAF] - loss_weights = [1.0] - hidden_layers_activation_function = 'tanh' - num_consecutive_training_steps = 5 - num_steps_between_copying_online_weights_to_target = 1 - rate_for_copying_weights_to_target = 0.001 - optimizer_type = 'RMSProp' - async_training = True - - -class PPO(AgentParameters): - type = 'PPOAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.V] - loss_weights = [1.0] - hidden_layers_activation_function = 'tanh' - num_episodes_in_experience_replay = 1000000 - policy_gradient_rescaler = 'A_VALUE' - gae_lambda = 0.96 - target_kl_divergence = 0.01 - initial_kl_coefficient = 1.0 - 
high_kl_penalty_coefficient = 1000 - add_a_normalized_timestep_to_the_observation = True - l2_regularization = 0#1e-3 - value_targets_mix_fraction = 0.1 - async_training = True - estimate_value_using_gae = True - step_until_collecting_full_episodes = True - - -class ClippedPPO(AgentParameters): - type = 'ClippedPPOAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.V, OutputTypes.PPO] - loss_weights = [0.5, 1.0] - stop_gradients_from_head = [False, False] - hidden_layers_activation_function = 'tanh' - num_episodes_in_experience_replay = 1000000 - policy_gradient_rescaler = 'GAE' - gae_lambda = 0.95 - target_kl_divergence = 0.01 - initial_kl_coefficient = 1.0 - high_kl_penalty_coefficient = 1000 - add_a_normalized_timestep_to_the_observation = False - l2_regularization = 1e-3 - value_targets_mix_fraction = 0.1 - clip_likelihood_ratio_using_epsilon = 0.2 - async_training = False - use_kl_regularization = False - estimate_value_using_gae = True - batch_size = 64 - use_separate_networks_per_head = True - step_until_collecting_full_episodes = True - beta_entropy = 0.01 - - -class DFP(AgentParameters): - type = 'DFPAgent' - input_types = { - 'observation': InputTypes.Observation, - 'measurements': InputTypes.Measurements, - 'goal': InputTypes.GoalVector - } - output_types = [OutputTypes.MeasurementsPrediction] - loss_weights = [1.0] - use_measurements = True - num_predicted_steps_ahead = 6 - goal_vector = [1.0, 1.0] - future_measurements_weights = [0.5, 0.5, 1.0] - async_training = True - - -class MMC(AgentParameters): - type = 'MixedMonteCarloAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - loss_weights = [1.0] - num_steps_between_copying_online_weights_to_target = 1000 - monte_carlo_mixing_rate = 0.1 - neon_support = True - - -class PAL(AgentParameters): - type = 'PALAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - loss_weights = [1.0] - pal_alpha = 0.9 - persistent_advantage_learning = False - num_steps_between_copying_online_weights_to_target = 1000 - neon_support = True - - -class BC(AgentParameters): - type = 'BCAgent' - input_types = {'observation': InputTypes.Observation} - output_types = [OutputTypes.Q] - loss_weights = [1.0] - collect_new_data = False - evaluate_every_x_training_iterations = 50000 - - -class EGreedyExploration(ExplorationParameters): - policy = 'EGreedy' - initial_epsilon = 0.5 - final_epsilon = 0.01 - epsilon_decay_steps = 50000 - evaluation_epsilon = 0.05 - initial_noise_variance_percentage = 0.1 - final_noise_variance_percentage = 0.1 - noise_variance_decay_steps = 50000 - - -class BootstrappedDQNExploration(ExplorationParameters): - policy = 'Bootstrapped' - architecture_num_q_heads = 10 - bootstrapped_data_sharing_probability = 0.1 - - -class OUExploration(ExplorationParameters): - policy = 'OUProcess' - mu = 0 - theta = 0.15 - sigma = 0.3 - dt = 0.01 - - -class AdditiveNoiseExploration(ExplorationParameters): - policy = 'AdditiveNoise' - initial_noise_variance_percentage = 0.1 - final_noise_variance_percentage = 0.1 - noise_variance_decay_steps = 50000 - - -class EntropyExploration(ExplorationParameters): - policy = 'ContinuousEntropy' - - -class CategoricalExploration(ExplorationParameters): - policy = 'Categorical' - - -class Preset(GeneralParameters): - def __init__(self, agent, env, exploration, visualization=VisualizationParameters): - """ - :type agent: AgentParameters - :type env: EnvironmentParameters - :type 
exploration: ExplorationParameters - :type visualization: VisualizationParameters - """ - self.visualization = visualization - self.agent = agent - self.env = env - self.exploration = exploration diff --git a/dashboard_components/boards.py b/dashboard_components/boards.py deleted file mode 100644 index dcbb5a3..0000000 --- a/dashboard_components/boards.py +++ /dev/null @@ -1,18 +0,0 @@ -from bokeh.layouts import column -from bokeh.models.widgets import Panel, Tabs -from dashboard_components.experiment_board import experiment_board_layout -from dashboard_components.globals import spinner, layouts -from bokeh.models.widgets import Div - -# ---------------- Build Website Layout ------------------- - -# title -title = Div(text="""
Coach Dashboard
""") - -tab1 = Panel(child=experiment_board_layout, title='experiment board') -tabs = Tabs(tabs=[tab1]) - -layout = column(title, tabs) -layout = column(layout, spinner) - -layouts['boards'] = layout diff --git a/dashboard_components/signals_file.py b/dashboard_components/signals_file.py deleted file mode 100644 index 1e89e18..0000000 --- a/dashboard_components/signals_file.py +++ /dev/null @@ -1,39 +0,0 @@ -import os - -import pandas as pd -from pandas.errors import EmptyDataError - -from dashboard_components.signals_file_base import SignalsFileBase -from utils import break_file_path - - -class SignalsFile(SignalsFileBase): - def __init__(self, csv_path, load=True, plot=None): - super().__init__(plot) - self.full_csv_path = csv_path - self.dir, self.filename, _ = break_file_path(csv_path) - if load: - self.load() - # this helps set the correct x axis - self.change_averaging_window(1, force=True) - - def load_csv(self): - # load csv and fix sparse data. - # csv can be in the middle of being written so we use try - except - self.csv = None - while self.csv is None: - try: - self.csv = pd.read_csv(self.full_csv_path) - break - except EmptyDataError: - self.csv = None - continue - self.csv = self.csv.interpolate() - self.csv.fillna(value=0, inplace=True) - - self.csv['Wall-Clock Time'] /= 60. - - self.last_modified = os.path.getmtime(self.full_csv_path) - - def file_was_modified_on_disk(self): - return self.last_modified != os.path.getmtime(self.full_csv_path) \ No newline at end of file diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 0000000..0779c3a --- /dev/null +++ b/docs/404.html @@ -0,0 +1,244 @@ + + + + + + + + + + + Reinforcement Learning Coach + + + + + + + + + + + + + + + +
[docs/404.html body elided: the HTML markup was stripped during extraction; the page's visible text is the "Docs »" breadcrumb, a "404" heading, and the message "Page not found".]
diff --git a/architectures/neon_components/__init__.py b/docs/__init__.py similarity index 100% rename from architectures/neon_components/__init__.py rename to docs/__init__.py diff --git a/docs/algorithms/imitation/bc/index.html b/docs/algorithms/imitation/bc/index.html index c185dec..cb972a2 100644 --- a/docs/algorithms/imitation/bc/index.html +++ b/docs/algorithms/imitation/bc/index.html @@ -3,33 +3,29 @@ [head markup stripped during extraction; the recoverable change is the page title, "Behavioral Cloning - Reinforcement Learning Coach Documentation" replaced with "Behavioral Cloning - Reinforcement Learning Coach"] @@ -40,7 +36,7 @@