mirror of https://github.com/gryf/coach.git
synced 2026-02-10 18:45:51 +01:00
coach v0.8.0
agents/__init__.py (Normal file, 34 lines)
@@ -0,0 +1,34 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.actor_critic_agent import *
from agents.agent import *
from agents.bootstrapped_dqn_agent import *
from agents.clipped_ppo_agent import *
from agents.ddpg_agent import *
from agents.ddqn_agent import *
from agents.dfp_agent import *
from agents.dqn_agent import *
from agents.distributional_dqn_agent import *
from agents.mmc_agent import *
from agents.n_step_q_agent import *
from agents.naf_agent import *
from agents.nec_agent import *
from agents.pal_agent import *
from agents.policy_gradients_agent import *
from agents.policy_optimization_agent import *
from agents.ppo_agent import *
from agents.value_optimization_agent import *
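Editorial note (not part of the commit): because every agent module is star-imported above, the agent classes resolve directly off the agents package, in the same spirit as the string-based eval() construction used for memories and exploration policies in agents/agent.py below. A minimal, hypothetical usage sketch, assuming env is an EnvironmentWrapper and preset is a configured Preset:

    import agents

    # direct construction
    agent = agents.DQNAgent(env, preset)

    # name-based construction, mirroring the eval() factory pattern in agent.py
    agent_class_name = 'DQNAgent'
    agent = eval('agents.' + agent_class_name)(env, preset)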
agents/actor_critic_agent.py (Normal file, 136 lines)
@@ -0,0 +1,136 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal


# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
        PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
        self.last_gradient_update_step_idx = 0
        self.action_advantages = Signal('Advantages')
        self.state_values = Signal('Values')
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.signals.append(self.action_advantages)
        self.signals.append(self.state_values)
        self.signals.append(self.unclipped_grads)

    # Discounting function used to calculate discounted returns.
    def discount(self, x, gamma):
        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    def get_general_advantage_estimation_values(self, rewards, values):
        # values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... t+n)
        bootstrap_extended_rewards = np.array(rewards.tolist() + [values[-1]])

        # Approximation-based calculation of GAE (mathematically correct only when Tmax = inf,
        # although in practice it works well even for much smaller Tmax values, e.g. 20)
        deltas = rewards + self.tp.agent.discount * values[1:] - values[:-1]
        gae = self.discount(deltas, self.tp.agent.discount * self.tp.agent.gae_lambda)

        if self.tp.agent.estimate_value_using_gae:
            discounted_returns = np.expand_dims(gae + values[:-1], -1)
        else:
            discounted_returns = np.expand_dims(np.array(self.discount(bootstrap_extended_rewards,
                                                                       self.tp.agent.discount)), 1)[:-1]
        return gae, discounted_returns

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # get the values for the current states
        result = self.main_network.online_network.predict(current_states)
        current_state_values = result[0]
        self.state_values.add_sample(current_state_values)

        # the targets for the state value estimator
        num_transitions = len(game_overs)
        state_value_head_targets = np.zeros((num_transitions, 1))

        # estimate the advantage function
        action_advantages = np.zeros((num_transitions, 1))

        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            if game_overs[-1]:
                R = 0
            else:
                R = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]

            for i in reversed(range(num_transitions)):
                R = rewards[i] + self.tp.agent.discount * R
                state_value_head_targets[i] = R
                action_advantages[i] = R - current_state_values[i]

        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            bootstrapped_value = self.main_network.online_network.predict(np.expand_dims(next_states[-1], 0))[0]
            values = np.append(current_state_values, bootstrapped_value)
            if game_overs[-1]:
                values[-1] = 0

            # get general discounted returns table
            gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values)
            action_advantages = np.vstack(gae_values)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        action_advantages = action_advantages.squeeze(axis=-1)
        if not self.env.discrete_controls and len(actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        # train
        result = self.main_network.online_network.accumulate_gradients([current_states, actions],
                                                                       [state_value_head_targets, action_advantages])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.action_advantages.add_sample(action_advantages)
        self.unclipped_grads.add_sample(unclipped_grads)
        logger.create_signal_value('Value Loss', losses[0])
        logger.create_signal_value('Policy Loss', losses[1])

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        if self.env.discrete_controls:
            # DISCRETE
            state_value, action_probabilities = self.main_network.online_network.predict(observation)
            action_probabilities = action_probabilities.squeeze()
            if phase == RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_probabilities)
            else:
                action = np.argmax(action_probabilities)
            action_info = {"action_probability": action_probabilities[action], "state_value": state_value}
            self.entropy.add_sample(-np.sum(action_probabilities * np.log(action_probabilities)))
        else:
            # CONTINUOUS
            state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
            action_values_mean = action_values_mean.squeeze()
            action_values_std = action_values_std.squeeze()
            if phase == RunPhase.TRAIN:
                action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
            else:
                action = action_values_mean
            action_info = {"action_probability": action, "state_value": state_value}

        return action, action_info
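Editorial note (not part of the commit): discount(x, gamma) computes the reverse-time filter y_t = x_t + gamma * y_{t+1}, so get_general_advantage_estimation_values() implements the generalized advantage estimator with discount gamma and decay lambda:

    \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t)
    \hat{A}_t^{GAE(\gamma,\lambda)} = \sum_{l \ge 0} (\gamma \lambda)^l \, \delta_{t+l}

The sum is truncated at the end of the collected rollout, which is why the comment in the code notes it is only exact as Tmax goes to infinity.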
agents/agent.py (Normal file, 536 lines)
@@ -0,0 +1,536 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import scipy.ndimage
import matplotlib.pyplot as plt
import copy
from configurations import Preset
from collections import OrderedDict
from utils import RunPhase, Signal, is_empty, RunningStat
from architectures import *
from exploration_policies import *
from memories import *
from memories.memory import *
from logger import logger, screen
import random
import time
import os
import itertools
from architectures.tensorflow_components.shared_variables import SharedRunningStats
from six.moves import range


class Agent:
    def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """

        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1,)

        # modules
        self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
                                                  + '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = 0
        self.curr_state = []
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
                                                                    shape=(self.tp.env.desired_observation_width,),
                                                                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
                                                               shape=(),
                                                               name='reward_stats')

        # env is already reset at this point. Otherwise we're getting an error where you cannot
        # reset an env which is not done
        self.reset_game(do_not_reset_env=True)

        # use seed
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)

    def log_to_screen(self, phase):
        # log to screen
        if self.current_episode > 0:
            if phase == RunPhase.TEST:
                exploration = self.evaluation_exploration_policy.get_control_param()
            else:
                exploration = self.exploration_policy.get_control_param()
            screen.log_dict(
                OrderedDict([
                    ("Worker", self.task_id),
                    ("Episode", self.current_episode),
                    ("total reward", self.total_reward_in_current_episode),
                    ("exploration", exploration),
                    ("steps", self.total_steps_counter),
                    ("training iteration", self.training_iteration)
                ]),
                prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
            )

    def update_log(self, phase=RunPhase.TRAIN):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        logger.set_current_time(self.current_episode)
        logger.create_signal_value('Training Iter', self.training_iteration)
        logger.create_signal_value('In Heatup', int(self.in_heatup))
        logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
        logger.create_signal_value('ER #Episodes', self.memory.length())
        logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
        logger.create_signal_value('Total steps', self.total_steps_counter)
        logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
        if phase == RunPhase.TRAIN:
            logger.create_signal_value("Training Reward", self.total_reward_in_current_episode)
        elif phase == RunPhase.TEST:
            logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode)
        logger.update_wall_clock_time(self.current_episode)

        for signal in self.signals:
            logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
            logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
            logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # dump
        if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0:
            logger.dump_output_csv()

    def reset_game(self, do_not_reset_env=False):
        """
        Resets all the episodic parameters and starts a new environment episode.
        :param do_not_reset_env: A boolean that allows prevention of environment reset
        :return: None
        """

        for signal in self.signals:
            signal.reset()
        self.total_reward_in_current_episode = 0
        self.curr_state = []
        self.last_episode_images = []
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        if not do_not_reset_env:
            self.env.reset()
        self.exploration_policy.reset()

        # required for online plotting
        if self.tp.visualization.plot_action_values_online:
            if hasattr(self, 'episode_running_info') and hasattr(self.env, 'actions_description'):
                for action in self.env.actions_description:
                    self.episode_running_info[action] = []
                plt.clf()
        if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
            for network in self.networks:
                network.curr_rnn_c_in = network.middleware_embedder.c_init
                network.curr_rnn_h_in = network.middleware_embedder.h_init

    def stack_observation(self, curr_stack, observation):
        """
        Adds a new observation to an existing stack of observations from previous time-steps.
        :param curr_stack: The current observation stack.
        :param observation: The new observation
        :return: The updated observation stack
        """

        if curr_stack == []:
            # starting an episode
            curr_stack = np.vstack(np.expand_dims([observation] * self.tp.env.observation_stack_size, 0))
            curr_stack = self.switch_axes_order(curr_stack, from_type='channels_first', to_type='channels_last')
        else:
            curr_stack = np.append(curr_stack, np.expand_dims(np.squeeze(observation), axis=-1), axis=-1)
            curr_stack = np.delete(curr_stack, 0, -1)

        return curr_stack

    def preprocess_observation(self, observation):
        """
        Preprocesses the given observation.
        For images - convert to grayscale, resize and convert to int.
        For measurement vectors - normalize by a running mean and std.
        :param observation: The agent's observation
        :return: A processed version of the observation
        """

        if self.env.is_state_type_image:
            # rescale
            observation = scipy.misc.imresize(observation,
                                              (self.tp.env.desired_observation_height,
                                               self.tp.env.desired_observation_width),
                                              interp=self.tp.rescaling_interpolation_type)
            # rgb to y
            if len(observation.shape) > 2 and observation.shape[2] > 1:
                r, g, b = observation[:, :, 0], observation[:, :, 1], observation[:, :, 2]
                observation = 0.2989 * r + 0.5870 * g + 0.1140 * b

            return observation.astype('uint8')
        else:
            if self.tp.env.normalize_observation:
                # standardize the input observation using a running mean and std
                if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                    self.running_observation_stats.push(observation)
                observation = (observation - self.running_observation_stats.mean) / \
                              (self.running_observation_stats.std + 1e-15)
                observation = np.clip(observation, -5.0, 5.0)
            return observation

    def learn_from_batch(self, batch):
        """
        Given a batch of transitions, calculates their target values and updates the network.
        :param batch: A list of transitions
        :return: The loss of the training
        """
        pass

    def train(self):
        """
        A single training iteration. Sample a batch, train on it and update target networks.
        :return: The training loss.
        """
        batch = self.memory.sample(self.tp.batch_size)
        loss = self.learn_from_batch(batch)

        if self.tp.learning_rate_decay_rate != 0:
            self.curr_learning_rate.add_sample(self.tp.sess.run(self.tp.learning_rate))
        else:
            self.curr_learning_rate.add_sample(self.tp.learning_rate)

        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network', 0, overwrite=False)

        return loss

    def extract_batch(self, batch):
        """
        Extracts a single numpy array for each object in a batch of transitions (state, action, etc.)
        :param batch: An array of transitions
        :return: For each transition element, returns a numpy array of all the transitions in the batch
        """

        current_observations = np.array([transition.state['observation'] for transition in batch])
        next_observations = np.array([transition.next_state['observation'] for transition in batch])
        actions = np.array([transition.action for transition in batch])
        rewards = np.array([transition.reward for transition in batch])
        game_overs = np.array([transition.game_over for transition in batch])
        total_return = np.array([transition.total_return for transition in batch])

        current_states = current_observations
        next_states = next_observations

        # get the entire state including measurements if available
        if self.tp.agent.use_measurements:
            current_measurements = np.array([transition.state['measurements'] for transition in batch])
            next_measurements = np.array([transition.next_state['measurements'] for transition in batch])
            current_states = [current_observations, current_measurements]
            next_states = [next_observations, next_measurements]

        return current_states, next_states, actions, rewards, game_overs, total_return

    def plot_action_values_online(self):
        """
        Plot an animated graph of the value of each possible action during the episode
        :return: None
        """

        plt.clf()
        for key, data_list in self.episode_running_info.items():
            plt.plot(data_list, label=key)
        plt.legend()
        plt.pause(0.00000001)

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        """
        Choose an action to act with during the current episode. Different behavior may be exhibited when training
        or testing.

        :param curr_state: the current state to act upon.
        :param phase: the current phase: training or testing.
        :return: chosen action, some action value describing the action (q-value, probability, etc.)
        """
        pass

    def preprocess_reward(self, reward):
        if self.tp.env.reward_scaling:
            reward /= float(self.tp.env.reward_scaling)
        if self.tp.env.reward_clipping_max:
            reward = min(reward, self.tp.env.reward_clipping_max)
        if self.tp.env.reward_clipping_min:
            reward = max(reward, self.tp.env.reward_clipping_min)
        return reward

    def switch_axes_order(self, observation, from_type='channels_first', to_type='channels_last'):
        """
        Transpose an observation's axes from channels_first to channels_last or vice versa
        :param observation: a numpy array
        :param from_type: can be 'channels_first' or 'channels_last'
        :param to_type: can be 'channels_first' or 'channels_last'
        :return: a new observation with the requested axes order
        """
        if from_type == to_type:
            return
        assert 2 <= len(observation.shape) <= 3, 'num axes of an observation must be 2 for a vector or 3 for an image'
        assert type(observation) == np.ndarray, 'observation must be a numpy array'
        if len(observation.shape) == 3:
            if from_type == 'channels_first' and to_type == 'channels_last':
                return np.transpose(observation, (1, 2, 0))
            elif from_type == 'channels_last' and to_type == 'channels_first':
                return np.transpose(observation, (2, 0, 1))
        else:
            return np.transpose(observation, (1, 0))

    def act(self, phase=RunPhase.TRAIN):
        """
        Take one step in the environment according to the network prediction and store the transition in memory
        :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
        :return: A boolean value that signals an episode termination
        """

        self.total_steps_counter += 1
        self.current_episode_steps_counter += 1

        # get new action
        action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0}
        is_first_transition_in_episode = (self.curr_state == [])
        if is_first_transition_in_episode:
            observation = self.preprocess_observation(self.env.observation)
            observation = self.stack_observation([], observation)

            self.curr_state = {'observation': observation}
            if self.tp.agent.use_measurements:
                self.curr_state['measurements'] = self.env.measurements
                if self.tp.agent.use_accumulated_reward_as_measurement:
                    self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)

        if self.in_heatup:  # we do not have a stacked curr_state yet
            action = self.env.get_random_action()
        else:
            action, action_info = self.choose_action(self.curr_state, phase=phase)

        # perform action
        if type(action) == np.ndarray:
            action = action.squeeze()
        result = self.env.step(action)
        shaped_reward = self.preprocess_reward(result['reward'])
        if 'action_intrinsic_reward' in action_info.keys():
            shaped_reward += action_info['action_intrinsic_reward']
        self.total_reward_in_current_episode += result['reward']
        observation = self.preprocess_observation(result['observation'])

        # plot action values online
        if self.tp.visualization.plot_action_values_online and not self.in_heatup:
            self.plot_action_values_online()

        # initialize the next state
        observation = self.stack_observation(self.curr_state['observation'], observation)

        next_state = {'observation': observation}
        if self.tp.agent.use_measurements and 'measurements' in result.keys():
            next_state['measurements'] = result['measurements']
            if self.tp.agent.use_accumulated_reward_as_measurement:
                next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)

        # store the transition only if we are training
        if phase == RunPhase.TRAIN:
            transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
            for key in action_info.keys():
                transition.info[key] = action_info[key]
            if self.tp.agent.add_a_normalized_timestep_to_the_observation:
                transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
            self.memory.store(transition)
        elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
            # we store the transitions only for saving gifs
            self.last_episode_images.append(self.env.get_rendered_image())

        # update the current state for the next step
        self.curr_state = next_state

        # deal with episode termination
        if result['done']:
            if self.tp.visualization.dump_csv:
                self.update_log(phase=phase)
            self.log_to_screen(phase=phase)

            if phase == RunPhase.TRAIN:
                self.reset_game()

            self.current_episode += 1

        # return whether the episode really ended
        return result['done']

    def evaluate(self, num_episodes, keep_networks_synced=False):
        """
        Run in an evaluation mode for several episodes. Actions will be chosen greedily.
        :param keep_networks_synced: keep the online network in sync with the global network after every episode
        :param num_episodes: The number of episodes to evaluate on
        :return: None
        """

        max_reward_achieved = -float('inf')
        average_evaluation_reward = 0
        screen.log_title("Running evaluation")
        self.env.change_phase(RunPhase.TEST)
        for i in range(num_episodes):
            # keep the online network in sync with the global network
            if keep_networks_synced:
                for network in self.networks:
                    network.sync()

            episode_ended = False
            while not episode_ended:
                episode_ended = self.act(phase=RunPhase.TEST)

            if self.tp.visualization.dump_gifs and self.total_reward_in_current_episode > max_reward_achieved:
                max_reward_achieved = self.total_reward_in_current_episode
                frame_skipping = int(5 / self.tp.env.frame_skip)
                logger.create_gif(self.last_episode_images[::frame_skipping],
                                  name='score-{}'.format(max_reward_achieved), fps=10)

            average_evaluation_reward += self.total_reward_in_current_episode
            self.reset_game()

        average_evaluation_reward /= float(num_episodes)

        self.env.change_phase(RunPhase.TRAIN)
        screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))

    def post_training_commands(self):
        pass

    def improve(self):
        """
        Training algorithms wrapper. Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]

        :return: None
        """

        # synchronize the online network weights with the global network
        for network in self.networks:
            network.sync()

        # heatup phase
        if self.tp.num_heatup_steps != 0:
            self.in_heatup = True
            screen.log_title("Starting heatup {}".format(self.task_id))
            num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
            for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
                self.act()

        # training phase
        self.in_heatup = False
        screen.log_title("Starting training {}".format(self.task_id))
        self.exploration_policy.change_phase(RunPhase.TRAIN)
        training_start_time = time.time()
        model_snapshots_periods_passed = -1

        while self.training_iteration < self.tp.num_training_iterations:
            # evaluate
            evaluate_agent = (self.last_episode_evaluation_ran != self.current_episode) and \
                             (self.current_episode % self.tp.evaluate_every_x_episodes == 0)
            if evaluate_agent:
                self.last_episode_evaluation_ran = self.current_episode
                self.evaluate(self.tp.evaluation_episodes)

            # snapshot model
            if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed:
                total_training_time = time.time() - training_start_time
                current_snapshot_period = (int(total_training_time) // self.tp.save_model_sec)
                if current_snapshot_period > model_snapshots_periods_passed:
                    model_snapshots_periods_passed = current_snapshot_period
                    self.main_network.save_model(model_snapshots_periods_passed)

            # play and record in replay buffer
            if self.tp.agent.step_until_collecting_full_episodes:
                step = 0
                while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(-1).length() != 0:
                    self.act()
                    step += 1
            else:
                for step in range(self.tp.agent.num_consecutive_playing_steps):
                    self.act()

            # train
            if self.tp.train:
                for step in range(self.tp.agent.num_consecutive_training_steps):
                    loss = self.train()
                    self.loss.add_sample(loss)
                    self.training_iteration += 1
                self.post_training_commands()
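Editorial note (not part of the commit): a toy, self-contained sketch of what Agent.stack_observation() does for image observations. The stack size of 4 and the 84x84 frame shape are illustrative assumptions; in practice they come from the preset.

    import numpy as np

    stack_size = 4
    frame = np.zeros((84, 84))

    # starting an episode: replicate the first frame and move channels last -> (84, 84, 4)
    stack = np.transpose(np.array([frame] * stack_size), (1, 2, 0))

    # later steps: append the newest frame on the last axis and drop the oldest
    new_frame = np.ones((84, 84))
    stack = np.append(stack, np.expand_dims(new_frame, axis=-1), axis=-1)
    stack = np.delete(stack, 0, axis=-1)   # shape stays (84, 84, 4)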
agents/bootstrapped_dqn_agent.py (Normal file, 58 lines)
@@ -0,0 +1,58 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
class BootstrappedDQNAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

    def reset_game(self, do_not_reset_env=False):
        ValueOptimizationAgent.reset_game(self, do_not_reset_env)
        self.exploration_policy.select_head()

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # for the action we actually took, the error is:
        # TD error = r + discount*max(q_st_plus_1) - q_st
        # for all other actions, the error is 0
        q_st_plus_1 = self.main_network.target_network.predict(next_states)
        # initialize with the current prediction so that we will
        # only update the action that we have actually done in this transition
        TD_targets = self.main_network.online_network.predict(current_states)

        for i in range(self.tp.batch_size):
            mask = batch[i].info['mask']
            for head_idx in range(self.tp.exploration.architecture_num_q_heads):
                if mask[head_idx] == 1:
                    TD_targets[head_idx][i, actions[i]] = rewards[i] + \
                        (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[head_idx][i], 0)

        result = self.main_network.train_and_sync_networks(current_states, TD_targets)

        total_loss = result[0]

        return total_loss

    def act(self, phase=RunPhase.TRAIN):
        ValueOptimizationAgent.act(self, phase)
        mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability,
                                  self.tp.exploration.architecture_num_q_heads)
        self.memory.update_last_transition_info({'mask': mask})
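Editorial note (not part of the commit): the mask drawn in act() decides which of the bootstrapped Q heads will train on the new transition. A small illustration with assumed values (10 heads, data-sharing probability 0.5):

    import numpy as np

    num_q_heads = 10                 # illustrative value
    data_sharing_probability = 0.5   # illustrative value
    mask = np.random.binomial(1, data_sharing_probability, num_q_heads)
    # e.g. array([1, 0, 1, 1, 0, 1, 0, 0, 1, 1]): roughly half of the heads
    # will receive a TD update for this transition in learn_from_batch()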
agents/clipped_ppo_agent.py (Normal file, 210 lines)
@@ -0,0 +1,210 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.actor_critic_agent import *
from random import shuffle
import tensorflow as tf


# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
class ClippedPPOAgent(ActorCriticAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                  create_target_network=True)
        # signals definition
        self.value_loss = Signal('Value Loss')
        self.signals.append(self.value_loss)
        self.policy_loss = Signal('Policy Loss')
        self.signals.append(self.policy_loss)
        self.total_kl_divergence_during_training_process = 0.0
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.signals.append(self.unclipped_grads)
        self.value_targets = Signal('Value Targets')
        self.signals.append(self.value_targets)
        self.kl_divergence = Signal('KL Divergence')
        self.signals.append(self.kl_divergence)

    def fill_advantages(self, batch):
        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        current_state_values = self.main_network.online_network.predict([current_states])[0]
        current_state_values = current_state_values.squeeze()
        self.state_values.add_sample(current_state_values)

        # calculate advantages
        advantages = []
        value_targets = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_return - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            value_targets = np.array([])
            for idx, game_over in enumerate(game_overs):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1,))
                    rollout_state_values = np.append(current_state_values[episode_start_idx:idx + 1], value_bootstrapping)

                    rollout_advantages, gae_based_value_targets = \
                        self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx + 1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
                    value_targets = np.append(value_targets, gae_based_value_targets)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        for transition, advantage, value_target in zip(batch, advantages, value_targets):
            transition.info['advantage'] = advantage
            transition.info['gae_based_value_target'] = value_target

        self.action_advantages.add_sample(advantages)

    def train_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            shuffle(dataset)
            for i in range(int(len(dataset) / self.tp.batch_size)):
                batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size]
                current_states, _, actions, _, _, total_return = self.extract_batch(batch)

                advantages = np.array([t.info['advantage'] for t in batch])
                gae_based_value_targets = np.array([t.info['gae_based_value_target'] for t in batch])
                if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                result = self.main_network.target_network.predict([current_states])
                old_policy_distribution = result[1:]

                # calculate gradients and apply on both the local policy network and on the global policy network
                fetches = [self.main_network.online_network.output_heads[1].kl_divergence,
                           self.main_network.online_network.output_heads[1].entropy]

                total_return = np.expand_dims(total_return, -1)
                value_targets = gae_based_value_targets if self.tp.agent.estimate_value_using_gae else total_return
                total_loss, policy_losses, unclipped_grads, fetch_result = \
                    self.main_network.online_network.accumulate_gradients(
                        [current_states] + [actions] + old_policy_distribution,
                        [total_return, advantages], additional_fetches=fetches)

                self.value_targets.add_sample(value_targets)
                if self.tp.distributed:
                    self.main_network.apply_gradients_to_global_network()
                    self.main_network.update_online_network()
                else:
                    self.main_network.apply_gradients_to_online_network()

                self.main_network.online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.tp.learning_rate_decay_rate != 0:
                curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.tp.learning_rate

            # log training parameters
            screen.log_dict(
                OrderedDict([
                    ("Surrogate loss", loss['policy_losses'][0]),
                    ("KL divergence", loss['fetch_result'][0]),
                    ("Entropy", loss['fetch_result'][1]),
                    ("training epoch", j),
                    ("learning_rate", curr_learning_rate)
                ]),
                prefix="Policy training"
            )

        self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return policy_losses

    def post_training_commands(self):
        # clean memory
        self.memory.clean()

    def train(self):
        self.main_network.sync()

        dataset = self.memory.transitions

        self.fill_advantages(dataset)

        # take only the requested number of steps
        dataset = dataset[:self.tp.agent.num_consecutive_playing_steps]

        if self.tp.distributed and self.tp.agent.share_statistics_between_workers:
            self.running_observation_stats.push(np.array([t.state['observation'] for t in dataset]))

        losses = self.train_network(dataset, 10)
        self.value_loss.add_sample(losses[0])
        self.policy_loss.add_sample(losses[1])
        self.update_log()  # should be done in order to update the data that has been accumulated *while not playing*
        return np.append(losses[0], losses[1])

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = curr_state['observation']
        observation = np.expand_dims(np.array(observation), 0)

        if self.env.discrete_controls:
            # DISCRETE
            _, action_values = self.main_network.online_network.predict(observation)
            action_values = action_values.squeeze()

            if phase == RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_values)
            else:
                action = np.argmax(action_values)
            action_info = {"action_probability": action_values[action]}
            # self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
        else:
            # CONTINUOUS
            _, action_values_mean, action_values_std = self.main_network.online_network.predict(observation)
            action_values_mean = action_values_mean.squeeze()
            action_values_std = action_values_std.squeeze()
            if phase == RunPhase.TRAIN:
                action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
                # if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5:
                #     print action
            else:
                action = action_values_mean
            action_info = {"action_probability": action_values_mean}

        return action, action_info
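Editorial note (not part of the commit): the surrogate that this agent is expected to optimize is the clipped objective from the PPO paper linked above; the clipping itself presumably lives in the network's policy output head, which is not part of this diff. With probability ratio r_t and the advantages filled in by fill_advantages():

    r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{old}}(a_t \mid s_t)}
    L^{CLIP}(\theta) = \mathbb{E}_t\big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\big]

Here the old-policy distribution comes from the target network that train() syncs before each update, and the advantages are standardized over the whole dataset before training.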
agents/ddpg_agent.py (Normal file, 104 lines)
@@ -0,0 +1,104 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.actor_critic_agent import *
from configurations import *


# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                  create_target_network=True)
        # define critic network
        self.critic_network = self.main_network
        # self.networks.append(self.critic_network)

        # define actor network
        tuning_parameters.agent.input_types = [InputTypes.Observation]
        tuning_parameters.agent.output_types = [OutputTypes.Pi]
        self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
                                            self.replicated_device, self.worker_device)
        self.networks.append(self.actor_network)

        self.q_values = Signal("Q")
        self.signals.append(self.q_values)

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # TD error = r + discount*max(q_st_plus_1) - q_st
        next_actions = self.actor_network.target_network.predict([next_states])
        q_st_plus_1 = self.critic_network.target_network.predict([next_states, next_actions])
        TD_targets = np.expand_dims(rewards, -1) + \
                     (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * q_st_plus_1

        # get the gradients of the critic output with respect to the action
        actions_mean = self.actor_network.online_network.predict(current_states)
        critic_online_network = self.critic_network.online_network
        action_gradients = self.critic_network.sess.run(critic_online_network.gradients_wrt_inputs[1],
                                                        feed_dict={
                                                            critic_online_network.inputs[0]: current_states,
                                                            critic_online_network.inputs[1]: actions_mean,
                                                        })[0]

        # train the critic
        if len(actions.shape) == 1:
            actions = np.expand_dims(actions, -1)
        result = self.critic_network.train_and_sync_networks([current_states, actions], TD_targets)
        total_loss = result[0]

        # apply the gradients from the critic to the actor
        actor_online_network = self.actor_network.online_network
        gradients = self.actor_network.sess.run(actor_online_network.weighted_gradients,
                                                feed_dict={
                                                    actor_online_network.gradients_weights_ph: -action_gradients,
                                                    actor_online_network.inputs[0]: current_states
                                                })
        if self.actor_network.has_global:
            self.actor_network.global_network.apply_gradients(gradients)
            self.actor_network.update_online_network()
        else:
            self.actor_network.online_network.apply_gradients(gradients)

        return total_loss

    def train(self):
        return Agent.train(self)

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        assert not self.env.discrete_controls, 'DDPG works only for continuous control problems'
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        result = self.actor_network.online_network.predict(observation)
        action_values = result[0].squeeze()

        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
        else:
            action = action_values

        action = np.clip(action, self.env.action_space_low, self.env.action_space_high)

        # get q value
        action_batch = np.expand_dims(action, 0)
        if type(action) != np.ndarray:
            action_batch = np.array([[action]])
        q_value = self.critic_network.online_network.predict([observation, action_batch])[0]
        self.q_values.add_sample(q_value)
        action_info = {"action_value": q_value}

        return action, action_info
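Editorial note (not part of the commit): learn_from_batch() realizes the deterministic policy gradient from the DDPG paper,

    \nabla_{\theta^\mu} J \approx \mathbb{E}\big[\nabla_a Q(s, a \mid \theta^Q)\big|_{a=\mu(s)}\ \nabla_{\theta^\mu}\mu(s \mid \theta^\mu)\big]

The critic supplies the action gradient via gradients_wrt_inputs[1]; feeding it negated as the weights of the actor's weighted_gradients op turns the optimizer's descent step into ascent on Q.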
agents/ddqn_agent.py (Normal file, 42 lines)
@@ -0,0 +1,42 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)
        q_st_plus_1 = self.main_network.target_network.predict(next_states)
        # initialize with the current prediction so that we will
        # only update the action that we have actually done in this transition
        TD_targets = self.main_network.online_network.predict(current_states)

        for i in range(self.tp.batch_size):
            TD_targets[i, actions[i]] = rewards[i] \
                                        + (1.0 - game_overs[i]) * self.tp.agent.discount \
                                        * q_st_plus_1[i][selected_actions[i]]

        result = self.main_network.train_and_sync_networks(current_states, TD_targets)
        total_loss = result[0]

        return total_loss
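Editorial note (not part of the commit): the loop above builds the Double DQN target, in which the online network selects the next action and the target network evaluates it:

    y_i = r_i + \gamma\,(1 - \mathrm{done}_i)\; Q_{target}\big(s'_i,\ \arg\max_a Q_{online}(s'_i, a)\big)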
agents/dfp_agent.py (Normal file, 83 lines)
@@ -0,0 +1,83 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.agent import *


# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.current_goal = self.tp.agent.goal_vector
        self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
                                           self.replicated_device, self.worker_device)
        self.networks.append(self.main_network)

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)

        # create the inputs for the network
        input = current_states
        input.append(np.repeat(np.expand_dims(self.current_goal, 0), self.tp.batch_size, 0))

        # get the current outputs of the network
        targets = self.main_network.online_network.predict(input)

        # change the targets for the taken actions
        for i in range(self.tp.batch_size):
            targets[i, actions[i]] = batch[i].info['future_measurements'].flatten()

        result = self.main_network.train_and_sync_networks(current_states, targets)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
        goal = np.expand_dims(self.current_goal, 0)

        # predict the future measurements
        measurements_future_prediction = self.main_network.online_network.predict([observation, measurements, goal])[0]
        action_values = np.zeros((self.action_space_size,))
        num_steps_used_for_objective = len(self.tp.agent.future_measurements_weights)

        # calculate the score of each action by multiplying its predicted future measurements with the goal vector
        for action_idx in range(self.action_space_size):
            action_measurements = measurements_future_prediction[action_idx]
            action_measurements = np.reshape(action_measurements,
                                             (self.tp.agent.num_predicted_steps_ahead, self.measurements_size[0]))
            future_steps_values = np.dot(action_measurements, self.current_goal)
            action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
                                               self.tp.agent.future_measurements_weights)

        # choose an action according to the exploration policy and the current phase (evaluating or training the agent)
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
        else:
            action = np.argmax(action_values)

        action_values = action_values.squeeze()

        # store information for interactive plotting (the actual plotting is done in the base Agent)
        if self.tp.visualization.plot_action_values_online:
            for idx, action_name in enumerate(self.env.actions_description):
                self.episode_running_info[action_name].append(action_values[idx])

        action_info = {"action_probability": 0, "action_value": action_values[action]}

        return action, action_info
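Editorial note (not part of the commit): the action scoring above corresponds to

    a^* = \arg\max_a \sum_{k} w_k \; g^\top \hat{m}^{(a)}_{t+\tau_k}

where g is the goal vector, \hat{m}^{(a)}_{t+\tau_k} is the predicted measurement vector for action a at the k-th future offset, and the weights w_k are tp.agent.future_measurements_weights, applied to the last predicted steps only.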
agents/distributional_dqn_agent.py (Normal file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
|
||||
|
||||
# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
|
||||
class DistributionalDQNAgent(ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
|
||||
|
||||
# prediction's format is (batch,actions,atoms)
|
||||
def get_q_values(self, prediction):
|
||||
return np.dot(prediction, self.z_values)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
|
||||
|
||||
# for the action we actually took, the error is calculated by the atoms distribution
|
||||
# for all other actions, the error is 0
|
||||
distributed_q_st_plus_1 = self.main_network.target_network.predict(next_states)
|
||||
# initialize with the current prediction so that we will
|
||||
TD_targets = self.main_network.online_network.predict(current_states)
|
||||
|
||||
# only update the action that we have actually done in this transition
|
||||
target_actions = np.argmax(self.get_q_values(distributed_q_st_plus_1), axis=1)
|
||||
m = np.zeros((self.tp.batch_size, self.z_values.size))
|
||||
|
||||
batches = np.arange(self.tp.batch_size)
|
||||
for j in range(self.z_values.size):
|
||||
tzj = np.fmax(np.fmin(rewards + (1.0 - game_overs) * self.tp.agent.discount * self.z_values[j],
|
||||
self.z_values[self.z_values.size - 1]),
|
||||
self.z_values[0])
|
||||
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
|
||||
u = (np.ceil(bj)).astype(int)
|
||||
l = (np.floor(bj)).astype(int)
|
||||
m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))
|
||||
m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))
|
||||
# total_loss = cross entropy between actual result above and predicted result for the given action
|
||||
TD_targets[batches, actions] = m
|
||||
|
||||
result = self.main_network.train_and_sync_networks(current_states, TD_targets)
|
||||
total_loss = result[0]
|
||||
|
||||
return total_loss
|
||||
|
||||
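A self-contained sketch of the categorical projection performed by the loop above, for a single transition. The support, distribution, reward and discount are toy numbers, and the case where a shifted atom lands exactly on a support point is handled explicitly here for clarity.

import numpy as np

# Toy support of 5 atoms between v_min=-1 and v_max=1, and a fake next-state
# distribution for the greedy action.
z = np.linspace(-1.0, 1.0, 5)
next_dist = np.array([0.1, 0.2, 0.4, 0.2, 0.1])   # sums to 1
reward, discount, terminal = 0.3, 0.9, 0.0

m = np.zeros_like(z)
for zj, pj in zip(z, next_dist):
    tzj = np.clip(reward + (1.0 - terminal) * discount * zj, z[0], z[-1])   # shifted/clipped atom
    bj = (tzj - z[0]) / (z[1] - z[0])                                       # fractional index into the support
    l, u = int(np.floor(bj)), int(np.ceil(bj))
    if u == l:
        m[l] += pj                       # atom falls exactly on a support point
    else:
        m[l] += pj * (u - bj)            # split the probability mass between
        m[u] += pj * (bj - l)            # the two neighbouring atoms

print(m, m.sum())   # projected target distribution, still sums to 1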
43
agents/dqn_agent.py
Normal file
@@ -0,0 +1,43 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # for the action we actually took, the error is:
        # TD error = r + discount*max(q_st_plus_1) - q_st
        # for all other actions, the error is 0
        q_st_plus_1 = self.main_network.target_network.predict(next_states)

        # initialize with the current prediction so that we will
        # only update the action that we have actually done in this transition
        TD_targets = self.main_network.online_network.predict(current_states)
        for i in range(self.tp.batch_size):
            TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(
                q_st_plus_1[i], 0)

        result = self.main_network.train_and_sync_networks(current_states, TD_targets)
        total_loss = result[0]

        return total_loss
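A toy, vectorized restatement of the per-sample loop above (all numbers are made up): only the entry of the action that was actually taken is replaced by the bootstrapped target, so the regression error on every other action is zero.

import numpy as np

rewards   = np.array([1.0, 0.0, -1.0])
game_over = np.array([0.0, 0.0, 1.0])
actions   = np.array([0, 1, 1])
discount  = 0.99

q_next_target = np.array([[0.5, 0.2],     # target-network Q values for the next states
                          [0.1, 0.4],
                          [0.0, 0.0]])
td_targets = np.array([[0.3, 0.1],        # online-network Q values for the current states
                       [0.2, 0.2],
                       [0.5, 0.6]])

rows = np.arange(len(rewards))
td_targets[rows, actions] = rewards + (1.0 - game_over) * discount * q_next_target.max(axis=1)
print(td_targets)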
42
agents/mmc_agent.py
Normal file
@@ -0,0 +1,42 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


class MixedMonteCarloAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        TD_targets = self.main_network.online_network.predict(current_states)
        selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)
        q_st_plus_1 = self.main_network.target_network.predict(next_states)

        # initialize with the current prediction so that we will
        # only update the action that we have actually done in this transition
        for i in range(self.tp.batch_size):
            one_step_target = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * q_st_plus_1[i][
                selected_actions[i]]
            monte_carlo_target = total_return[i]
            TD_targets[i, actions[i]] = (1 - self.mixing_rate) * one_step_target + self.mixing_rate * monte_carlo_target

        result = self.main_network.train_and_sync_networks(current_states, TD_targets)
        total_loss = result[0]

        return total_loss
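A tiny numerical restatement of the mixed target computed in the loop above (toy numbers): the bootstrapped one-step target and the observed Monte Carlo return are blended with the configured mixing rate.

# Toy numbers: mixing a one-step bootstrap with the observed Monte Carlo return.
mixing_rate = 0.1
one_step_target = 1.0 + 0.99 * 0.5          # r + discount * Q_target(s', argmax_a Q_online(s', a))
monte_carlo_target = 2.3                    # discounted return actually observed from s
mixed_target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_target
print(mixed_target)                         # 1.5755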
85
agents/n_step_q_agent.py
Normal file
@@ -0,0 +1,85 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *
from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal


# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
        self.last_gradient_update_step_idx = 0
        self.q_values = Signal('Q Values')
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.signals.append(self.q_values)
        self.signals.append(self.unclipped_grads)

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # get the values for the current states
        state_value_head_targets = self.main_network.online_network.predict(current_states)

        # the targets for the state value estimator
        num_transitions = len(game_overs)

        if self.tp.agent.targets_horizon == '1-Step':
            # 1-Step Q learning
            q_st_plus_1 = self.main_network.target_network.predict(next_states)

            for i in reversed(range(num_transitions)):
                state_value_head_targets[i][actions[i]] = \
                    rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0)

        elif self.tp.agent.targets_horizon == 'N-Step':
            # N-Step Q learning
            if game_overs[-1]:
                R = 0
            else:
                R = np.max(self.main_network.target_network.predict(np.expand_dims(next_states[-1], 0)))

            for i in reversed(range(num_transitions)):
                R = rewards[i] + self.tp.agent.discount * R
                state_value_head_targets[i][actions[i]] = R

        else:
            assert False, 'The available values for targets_horizon are: 1-Step, N-Step'

        # train
        result = self.main_network.online_network.accumulate_gradients([current_states], [state_value_head_targets])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.unclipped_grads.add_sample(unclipped_grads)
        logger.create_signal_value('Value Loss', losses[0])

        return total_loss

    def train(self):
        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network', 0, overwrite=False)

        return PolicyOptimizationAgent.train(self)
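A small sketch of the backward recursion used by the 'N-Step' branch above, with a made-up rollout: the return is bootstrapped from the last state's max Q value unless the episode terminated, then accumulated backwards through the rewards.

import numpy as np

rewards  = [1.0, 0.0, 0.0, 1.0]
discount = 0.9
terminal = False
bootstrap_max_q = 2.0            # max_a Q_target(s_last, a) from the target network

R = 0.0 if terminal else bootstrap_max_q
targets = np.zeros(len(rewards))
for i in reversed(range(len(rewards))):
    R = rewards[i] + discount * R        # accumulate the n-step return backwards
    targets[i] = R
print(targets)                           # [3.0412, 2.268, 2.52, 2.8]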
75
agents/naf_agent.py
Normal file
@@ -0,0 +1,75 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.l_values = Signal("L")
        self.a_values = Signal("Advantage")
        self.mu_values = Signal("Action")
        self.v_values = Signal("V")
        self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # TD error = r + discount*v_st_plus_1 - q_st
        v_st_plus_1 = self.main_network.sess.run(self.main_network.target_network.output_heads[0].V,
                                                 feed_dict={self.main_network.target_network.inputs[0]: next_states})
        TD_targets = np.expand_dims(rewards, -1) + (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * v_st_plus_1

        if len(actions.shape) == 1:
            actions = np.expand_dims(actions, -1)

        result = self.main_network.train_and_sync_networks([current_states, actions], TD_targets)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        assert not self.env.discrete_controls, 'NAF works only for continuous control problems'

        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        naf_head = self.main_network.online_network.output_heads[0]
        action_values = self.main_network.sess.run(naf_head.mu,
                                                   feed_dict={self.main_network.online_network.inputs[0]: observation})
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
        else:
            action = action_values

        Q, L, A, mu, V = self.main_network.sess.run(
            [naf_head.Q, naf_head.L, naf_head.A, naf_head.mu, naf_head.V],
            feed_dict={
                self.main_network.online_network.inputs[0]: observation,
                self.main_network.online_network.inputs[1]: action_values
            }
        )

        # store the q values statistics for logging
        self.q_values.add_sample(Q)
        self.l_values.add_sample(L)
        self.a_values.add_sample(A)
        self.mu_values.add_sample(mu)
        self.v_values.add_sample(V)

        action_value = {"action_value": Q}
        return action, action_value
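A small numpy sketch of the quadratic advantage that the NAF head's L, mu, A and V tensors represent, following the NAF paper rather than the head's exact TensorFlow graph (the action dimension, L factor and V value below are illustrative): A(s, a) = -1/2 (a - mu)^T L L^T (a - mu), so Q = V + A is maximized exactly at a = mu, which is why mu is used as the greedy action.

import numpy as np

# Illustrative 2-D action space.
mu = np.array([0.2, -0.1])                 # predicted greedy action
L = np.array([[1.0, 0.0],                  # lower-triangular factor predicted by the head
              [0.3, 0.5]])
P = L @ L.T                                # positive semi-definite matrix
V = 1.7                                    # predicted state value

def q_value(a):
    d = a - mu
    advantage = -0.5 * d @ P @ d           # quadratic advantage, <= 0 everywhere
    return V + advantage

print(q_value(mu))                         # 1.7 -- the maximum, attained at a = mu
print(q_value(np.array([1.0, 1.0])))       # strictly smaller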
104
agents/nec_agent.py
Normal file
@@ -0,0 +1,104 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                        create_target_network=False)
        self.current_episode_state_embeddings = []
        self.current_episode_actions = []
        self.training_started = False

    def learn_from_batch(self, batch):
        if not self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
            return 0
        else:
            if not self.training_started:
                self.training_started = True
                screen.log_title("Finished collecting initial entries in DND. Starting to train network...")

        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)
        result = self.main_network.train_and_sync_networks([current_states, actions], total_return)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)

        # get embedding
        embedding = self.main_network.sess.run(self.main_network.online_network.state_embedding,
                                               feed_dict={self.main_network.online_network.inputs[0]: observation})
        self.current_episode_state_embeddings.append(embedding[0])

        # get action values
        if self.main_network.online_network.output_heads[0].DND.has_enough_entries(self.tp.agent.number_of_knn):
            # if there are enough entries in the DND then we can query it to get the action values
            actions_q_values = []
            for action in range(self.action_space_size):
                feed_dict = {
                    self.main_network.online_network.state_embedding: embedding,
                    self.main_network.online_network.output_heads[0].input[0]: np.array([action])
                }
                q_value = self.main_network.sess.run(
                    self.main_network.online_network.output_heads[0].output, feed_dict=feed_dict)
                actions_q_values.append(q_value[0])
        else:
            # get only the embedding so we can insert it into the DND
            actions_q_values = [0] * self.action_space_size

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(actions_q_values)
            self.current_episode_actions.append(action)
        else:
            action = np.argmax(actions_q_values)

        # store the q values statistics for logging
        self.q_values.add_sample(actions_q_values)

        # store information for plotting interactively (actual plotting is done in agent)
        if self.tp.visualization.plot_action_values_online:
            for idx, action_name in enumerate(self.env.actions_description):
                self.episode_running_info[action_name].append(actions_q_values[idx])

        action_value = {"action_value": actions_q_values[action]}
        return action, action_value

    def reset_game(self, do_not_reset_env=False):
        ValueOptimizationAgent.reset_game(self, do_not_reset_env)

        # make sure we already have at least one complete episode
        if self.memory.num_complete_episodes() >= 1 and not self.in_heatup:
            # get the last full episode that we have collected
            episode = self.memory.get(-2)
            returns = []
            for i in range(episode.length()):
                returns.append(episode.get_transition(i).total_return)
            # Deal with the end of heatup, which might fall in the middle of an episode: the episode taken out of
            # the ER is then a complete one, whereas the other statistics collected here are collected only during
            # training, so keep only the returns that match the actions recorded during training.
            returns = returns[-len(self.current_episode_actions):]
            self.main_network.online_network.output_heads[0].DND.add(self.current_episode_state_embeddings,
                                                                     self.current_episode_actions, returns)

        self.current_episode_state_embeddings = []
        self.current_episode_actions = []
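A minimal sketch of the kNN lookup that a DND performs per action, in plain numpy with hypothetical keys/values arrays rather than the real DND head: the Q value is an inverse-distance weighted average of the returns stored for the nearest embeddings.

import numpy as np

# Hypothetical DND contents for a single action: stored state embeddings and their returns.
keys = np.array([[0.0, 0.0],
                 [1.0, 0.0],
                 [0.0, 1.0],
                 [2.0, 2.0]])
values = np.array([1.0, 0.5, 0.2, 3.0])
k, delta = 2, 1e-3

def dnd_lookup(query):
    dists = np.sum((keys - query) ** 2, axis=1)          # squared distances to all keys
    nearest = np.argsort(dists)[:k]                      # indices of the k nearest neighbours
    kernel = 1.0 / (dists[nearest] + delta)              # inverse-distance kernel
    weights = kernel / kernel.sum()
    return np.dot(weights, values[nearest])              # weighted average of stored returns

print(dnd_lookup(np.array([0.1, 0.1])))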
65
agents/pal_agent.py
Normal file
@@ -0,0 +1,65 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.value_optimization_agent import *


# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.alpha = tuning_parameters.agent.pal_alpha
        self.persistent = tuning_parameters.agent.persistent_advantage_learning
        self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        selected_actions = np.argmax(self.main_network.online_network.predict(next_states), 1)

        # next state values
        q_st_plus_1_target = self.main_network.target_network.predict(next_states)
        v_st_plus_1_target = np.max(q_st_plus_1_target, 1)

        # current state values according to the online network
        q_st_online = self.main_network.online_network.predict(current_states)

        # current state values according to the target network
        q_st_target = self.main_network.target_network.predict(current_states)
        v_st_target = np.max(q_st_target, 1)

        # calculate TD error
        TD_targets = np.copy(q_st_online)
        for i in range(self.tp.batch_size):
            TD_targets[i, actions[i]] = rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * \
                                        q_st_plus_1_target[i][selected_actions[i]]
            advantage_learning_update = v_st_target[i] - q_st_target[i, actions[i]]
            next_advantage_learning_update = v_st_plus_1_target[i] - q_st_plus_1_target[i, selected_actions[i]]
            # Persistent Advantage Learning or Regular Advantage Learning
            if self.persistent:
                TD_targets[i, actions[i]] -= self.alpha * min(advantage_learning_update, next_advantage_learning_update)
            else:
                TD_targets[i, actions[i]] -= self.alpha * advantage_learning_update

            # mixing monte carlo updates
            monte_carlo_target = total_return[i]
            TD_targets[i, actions[i]] = (1 - self.monte_carlo_mixing_rate) * TD_targets[i, actions[i]] \
                                        + self.monte_carlo_mixing_rate * monte_carlo_target

        result = self.main_network.train_and_sync_networks(current_states, TD_targets)
        total_loss = result[0]

        return total_loss
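A toy restatement of the per-sample computation in the loop above (the Q values, reward and coefficients are made up, and the Monte Carlo mixing step is omitted): the double-DQN target is reduced by the action gap in the current state, or, in the persistent variant, by the smaller of the current and next action gaps.

import numpy as np

alpha, discount = 0.9, 0.99
reward, terminal = 1.0, 0.0
q_next_target = np.array([0.8, 1.2])       # target net at s'
q_curr_target = np.array([0.4, 0.9])       # target net at s
action, next_action = 0, int(np.argmax(q_next_target))

double_dqn_target = reward + (1 - terminal) * discount * q_next_target[next_action]
advantage      = q_curr_target.max() - q_curr_target[action]        # V(s)  - Q(s, a)
next_advantage = q_next_target.max() - q_next_target[next_action]   # V(s') - Q(s', a') == 0 here

al_target  = double_dqn_target - alpha * advantage                        # Advantage Learning
pal_target = double_dqn_target - alpha * min(advantage, next_advantage)   # Persistent Advantage Learning
print(al_target, pal_target)               # 1.738 2.188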
87
agents/policy_gradients_agent.py
Normal file
@@ -0,0 +1,87 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.policy_optimization_agent import *
import numpy as np
from logger import *
import tensorflow as tf
import matplotlib.pyplot as plt

from utils import *


class PolicyGradientsAgent(PolicyOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

        self.last_gradient_update_step_idx = 0

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)

        for i in reversed(range(len(total_returns))):
            if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
                total_returns[i] = total_returns[0]
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
                # just take the total return as it is
                pass
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
                # we can get a single transition episode while playing Doom Basic, causing the std to be 0
                if self.std_discounted_return != 0:
                    total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
                else:
                    total_returns[i] = 0
            elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
                total_returns[i] -= self.mean_return_over_multiple_episodes[i]
            else:
                screen.warning("WARNING: The requested policy gradient rescaler is not available")

        targets = total_returns
        if not self.env.discrete_controls and len(actions.shape) < 2:
            actions = np.expand_dims(actions, -1)

        logger.create_signal_value('Returns Variance', np.std(total_returns), self.task_id)
        logger.create_signal_value('Returns Mean', np.mean(total_returns), self.task_id)

        result = self.main_network.online_network.accumulate_gradients([current_states, actions], targets)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        if self.env.discrete_controls:
            # DISCRETE
            action_values = self.main_network.online_network.predict(observation).squeeze()
            if phase == RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_values)
            else:
                action = np.argmax(action_values)
            action_value = {"action_probability": action_values[action]}
            self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
        else:
            # CONTINUOUS
            result = self.main_network.online_network.predict(observation)
            action_values = result[0].squeeze()
            if phase == RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_values)
            else:
                action = action_values
            action_value = {}

        return action, action_value
121
agents/policy_optimization_agent.py
Normal file
@@ -0,0 +1,121 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.agent import *
from memories.memory import Episode


class PolicyGradientRescaler(Enum):
    TOTAL_RETURN = 0
    FUTURE_RETURN = 1
    FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
    FUTURE_RETURN_NORMALIZED_BY_TIMESTEP = 3  # baselined
    Q_VALUE = 4
    A_VALUE = 5
    TD_RESIDUAL = 6
    DISCOUNTED_TD_RESIDUAL = 7
    GAE = 8


class PolicyOptimizationAgent(Agent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
                                           self.replicated_device, self.worker_device)
        self.networks.append(self.main_network)

        self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler)

        # statistics for variance reduction
        self.last_gradient_update_step_idx = 0
        self.max_episode_length = 100000
        self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
        self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
        self.entropy = Signal('Entropy')
        self.signals.append(self.entropy)

    def log_to_screen(self, phase):
        # log to screen
        if self.current_episode > 0:
            screen.log_dict(
                OrderedDict([
                    ("Worker", self.task_id),
                    ("Episode", self.current_episode),
                    ("total reward", self.total_reward_in_current_episode),
                    ("steps", self.total_steps_counter),
                    ("training iteration", self.training_iteration)
                ]),
                prefix="Heatup" if self.in_heatup else "Training" if phase == RunPhase.TRAIN else "Testing"
            )

    def update_episode_statistics(self, episode):
        episode_discounted_returns = []
        for i in range(episode.length()):
            transition = episode.get_transition(i)
            episode_discounted_returns.append(transition.total_return)
            self.num_episodes_where_step_has_been_seen[i] += 1
            self.mean_return_over_multiple_episodes[i] -= self.mean_return_over_multiple_episodes[i] / \
                                                          self.num_episodes_where_step_has_been_seen[i]
            self.mean_return_over_multiple_episodes[i] += transition.total_return / \
                                                          self.num_episodes_where_step_has_been_seen[i]
        self.mean_discounted_return = np.mean(episode_discounted_returns)
        self.std_discounted_return = np.std(episode_discounted_returns)

    def train(self):
        if self.memory.length() == 0:
            return 0

        episode = self.memory.get_episode(0)

        # check if we should calculate gradients or skip
        episode_ended = self.memory.num_complete_episodes() >= 1
        num_steps_passed_since_last_update = episode.length() - self.last_gradient_update_step_idx
        is_t_max_steps_passed = num_steps_passed_since_last_update >= self.tp.agent.num_steps_between_gradient_updates
        if not (is_t_max_steps_passed or episode_ended):
            return 0

        total_loss = 0
        if num_steps_passed_since_last_update > 0:

            # we need to update the returns of the episode until now
            episode.update_returns(self.tp.agent.discount)

            # get t_max transitions, or fewer if we reached a terminal state.
            # this is used for both actor-critic and vanilla PG.
            # in order to get full episodes, vanilla PG sets the end_idx to a very big value.
            transitions = []
            start_idx = self.last_gradient_update_step_idx
            end_idx = episode.length()

            for idx in range(start_idx, end_idx):
                transitions.append(episode.get_transition(idx))
            self.last_gradient_update_step_idx = end_idx

            # update the statistics for the variance reduction techniques
            if self.tp.agent.type == 'PolicyGradientsAgent':
                self.update_episode_statistics(episode)

            # accumulate the gradients and apply them once in every apply_gradients_every_x_episodes episodes
            total_loss = self.learn_from_batch(transitions)
            if self.current_episode % self.tp.agent.apply_gradients_every_x_episodes == 0:
                self.main_network.apply_gradients_and_sync_networks()

        # move the pointer to the next episode start and discard the episode. we use it only once
        if episode_ended:
            self.memory.remove_episode(0)
            self.last_gradient_update_step_idx = 0

        return total_loss
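A short sketch of the per-timestep baseline maintained by update_episode_statistics above, with made-up episode returns: the mean return seen at each step index is updated incrementally after every episode, without storing past episodes (the two-statement update in the agent is algebraically the same as the single running-mean step below).

import numpy as np

max_len = 10
mean_return_at_step = np.zeros(max_len)
episodes_seen_at_step = np.zeros(max_len)

def update(episode_returns):
    for i, ret in enumerate(episode_returns):
        episodes_seen_at_step[i] += 1
        # running mean: m <- m + (x - m) / n
        mean_return_at_step[i] += (ret - mean_return_at_step[i]) / episodes_seen_at_step[i]

update([5.0, 3.0, 1.0])
update([7.0, 5.0])
print(mean_return_at_step[:3])   # [6. 4. 1.]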
274
agents/ppo_agent.py
Normal file
@@ -0,0 +1,274 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.actor_critic_agent import *
from random import shuffle
import tensorflow as tf


# Proximal Policy Optimization - https://arxiv.org/pdf/1707.02286.pdf
class PPOAgent(ActorCriticAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                  create_target_network=True)
        self.critic_network = self.main_network

        # define the policy network
        tuning_parameters.agent.input_types = [InputTypes.Observation]
        tuning_parameters.agent.output_types = [OutputTypes.PPO]
        tuning_parameters.agent.optimizer_type = 'Adam'
        tuning_parameters.agent.l2_regularization = 0
        self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
                                             self.replicated_device, self.worker_device)
        self.networks.append(self.policy_network)

        # operations for changing the kl coefficient
        self.kl_coefficient = tf.placeholder('float', name='kl_coefficient')
        self.increase_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
                                                 self.kl_coefficient * 1.5)
        self.decrease_kl_coefficient = tf.assign(self.policy_network.online_network.output_heads[0].kl_coefficient,
                                                 self.kl_coefficient / 1.5)

        # signals definition
        self.value_loss = Signal('Value Loss')
        self.signals.append(self.value_loss)
        self.policy_loss = Signal('Policy Loss')
        self.signals.append(self.policy_loss)
        self.kl_divergence = Signal('KL Divergence')
        self.signals.append(self.kl_divergence)
        self.total_kl_divergence_during_training_process = 0.0
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.signals.append(self.unclipped_grads)

    def fill_advantages(self, batch):
        current_states, next_states, actions, rewards, game_overs, total_return = self.extract_batch(batch)

        # * Found not to have any impact *
        # current_states_with_timestep = self.concat_state_and_timestep(batch)

        current_state_values = self.critic_network.online_network.predict([current_states]).squeeze()

        # calculate advantages
        advantages = []
        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
            advantages = total_return - current_state_values
        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
            # get bootstraps
            episode_start_idx = 0
            advantages = np.array([])
            # current_state_values[game_overs] = 0
            for idx, game_over in enumerate(game_overs):
                if game_over:
                    # get advantages for the rollout
                    value_bootstrapping = np.zeros((1,))
                    rollout_state_values = np.append(current_state_values[episode_start_idx:idx+1], value_bootstrapping)

                    rollout_advantages, _ = \
                        self.get_general_advantage_estimation_values(rewards[episode_start_idx:idx+1],
                                                                     rollout_state_values)
                    episode_start_idx = idx + 1
                    advantages = np.append(advantages, rollout_advantages)
        else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")

        # standardize
        advantages = (advantages - np.mean(advantages)) / np.std(advantages)

        for transition, advantage in zip(self.memory.transitions, advantages):
            transition.info['advantage'] = advantage

        self.action_advantages.add_sample(advantages)

    def train_value_network(self, dataset, epochs):
        loss = []
        current_states, _, _, _, _, total_return = self.extract_batch(dataset)

        # * Found not to have any impact *
        # add a timestep to the observation
        # current_states_with_timestep = self.concat_state_and_timestep(dataset)

        total_return = np.expand_dims(total_return, -1)
        mix_fraction = self.tp.agent.value_targets_mix_fraction
        for j in range(epochs):
            batch_size = len(dataset)
            if self.critic_network.online_network.optimizer_type != 'LBFGS':
                batch_size = self.tp.batch_size
            for i in range(len(dataset) // batch_size):
                # split to batches for first order optimization techniques
                current_states_batch = current_states[i * batch_size:(i + 1) * batch_size]
                total_return_batch = total_return[i * batch_size:(i + 1) * batch_size]
                old_policy_values = force_list(self.critic_network.target_network.predict(
                    [current_states_batch]).squeeze())
                if self.critic_network.online_network.optimizer_type != 'LBFGS':
                    targets = total_return_batch
                else:
                    current_values = self.critic_network.online_network.predict([current_states_batch])
                    targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

                value_loss = self.critic_network.online_network.\
                    accumulate_gradients([current_states_batch] + old_policy_values, targets)
                self.critic_network.apply_gradients_to_online_network()
                if self.tp.distributed:
                    self.critic_network.apply_gradients_to_global_network()
                self.critic_network.online_network.reset_accumulated_gradients()

                loss.append([value_loss[0]])
        loss = np.mean(loss, 0)
        return loss

    def concat_state_and_timestep(self, dataset):
        current_states_with_timestep = [np.append(transition.state['observation'], transition.info['timestep'])
                                        for transition in dataset]
        current_states_with_timestep = np.expand_dims(current_states_with_timestep, -1)
        return current_states_with_timestep

    def train_policy_network(self, dataset, epochs):
        loss = []
        for j in range(epochs):
            loss = {
                'total_loss': [],
                'policy_losses': [],
                'unclipped_grads': [],
                'fetch_result': []
            }
            # shuffle(dataset)
            for i in range(len(dataset) // self.tp.batch_size):
                batch = dataset[i * self.tp.batch_size:(i + 1) * self.tp.batch_size]
                current_states, _, actions, _, _, total_return = self.extract_batch(batch)
                advantages = np.array([t.info['advantage'] for t in batch])
                if not self.tp.env_instance.discrete_controls and len(actions.shape) == 1:
                    actions = np.expand_dims(actions, -1)

                # get old policy probabilities and distribution
                old_policy = force_list(self.policy_network.target_network.predict([current_states]))

                # calculate gradients and apply them to both the local policy network and the global policy network
                fetches = [self.policy_network.online_network.output_heads[0].kl_divergence,
                           self.policy_network.online_network.output_heads[0].entropy]

                total_loss, policy_losses, unclipped_grads, fetch_result = \
                    self.policy_network.online_network.accumulate_gradients(
                        [current_states, actions] + old_policy, [advantages], additional_fetches=fetches)

                self.policy_network.apply_gradients_to_online_network()
                if self.tp.distributed:
                    self.policy_network.apply_gradients_to_global_network()

                self.policy_network.online_network.reset_accumulated_gradients()

                loss['total_loss'].append(total_loss)
                loss['policy_losses'].append(policy_losses)
                loss['unclipped_grads'].append(unclipped_grads)
                loss['fetch_result'].append(fetch_result)

                self.unclipped_grads.add_sample(unclipped_grads)

            for key in loss.keys():
                loss[key] = np.mean(loss[key], 0)

            if self.tp.learning_rate_decay_rate != 0:
                curr_learning_rate = self.tp.sess.run(self.tp.learning_rate)
                self.curr_learning_rate.add_sample(curr_learning_rate)
            else:
                curr_learning_rate = self.tp.learning_rate

            # log training parameters
            screen.log_dict(
                OrderedDict([
                    ("Surrogate loss", loss['policy_losses'][0]),
                    ("KL divergence", loss['fetch_result'][0]),
                    ("Entropy", loss['fetch_result'][1]),
                    ("training epoch", j),
                    ("learning_rate", curr_learning_rate)
                ]),
                prefix="Policy training"
            )

        self.total_kl_divergence_during_training_process = loss['fetch_result'][0]
        self.entropy.add_sample(loss['fetch_result'][1])
        self.kl_divergence.add_sample(loss['fetch_result'][0])
        return loss['total_loss']

    def update_kl_coefficient(self):
        # John Schulman takes the mean KL divergence only over the last epoch, which is strange, but we follow
        # his implementation for now because we know it works well
        screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))

        # update kl coefficient
        kl_target = self.tp.agent.target_kl_divergence
        kl_coefficient = self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)
        if self.total_kl_divergence_during_training_process > 1.3 * kl_target:
            # kl too high => increase regularization
            self.tp.sess.run(self.increase_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
        elif self.total_kl_divergence_during_training_process < 0.7 * kl_target:
            # kl too low => decrease regularization
            self.tp.sess.run(self.decrease_kl_coefficient, feed_dict={self.kl_coefficient: kl_coefficient})
        screen.log_title("KL penalty coefficient change = {} -> {}".format(
            kl_coefficient, self.tp.sess.run(self.policy_network.online_network.output_heads[0].kl_coefficient)))

    def post_training_commands(self):
        if self.tp.agent.use_kl_regularization:
            self.update_kl_coefficient()

        # clean memory
        self.memory.clean()

    def train(self):
        self.policy_network.sync()
        self.critic_network.sync()

        dataset = self.memory.transitions

        self.fill_advantages(dataset)

        # take only the requested number of steps
        dataset = dataset[:self.tp.agent.num_consecutive_playing_steps]

        value_loss = self.train_value_network(dataset, 1)
        policy_loss = self.train_policy_network(dataset, 10)

        self.value_loss.add_sample(value_loss)
        self.policy_loss.add_sample(policy_loss)
        self.update_log()  # should be done in order to update the data that has been accumulated * while not playing *
        return np.append(value_loss, policy_loss)

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = curr_state['observation']
        observation = np.expand_dims(np.array(observation), 0)

        if self.env.discrete_controls:
            # DISCRETE
            action_values = self.policy_network.online_network.predict(observation).squeeze()

            if phase == RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_values)
            else:
                action = np.argmax(action_values)
            action_info = {"action_probability": action_values[action]}
            # self.entropy.add_sample(-np.sum(action_values * np.log(action_values)))
        else:
            # CONTINUOUS
            action_values_mean, action_values_std = self.policy_network.online_network.predict(observation)
            action_values_mean = action_values_mean.squeeze()
            action_values_std = action_values_std.squeeze()
            if phase == RunPhase.TRAIN:
                action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
            else:
                action = action_values_mean
            action_info = {"action_probability": action_values_mean}

        return action, action_info
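A tiny sketch, in plain Python with no TensorFlow ops, of the adaptive KL-penalty rule that update_kl_coefficient implements above: the penalty coefficient is multiplied by 1.5 when the measured KL divergence overshoots 1.3x the target and divided by 1.5 when it undershoots 0.7x the target, otherwise it is left unchanged (the numeric inputs in the usage lines are made up).

def adapt_kl_coefficient(kl_coefficient, measured_kl, kl_target):
    # Mirrors the increase/decrease ops: tighten the penalty when the policy moved
    # too far from the old policy, relax it when it barely moved.
    if measured_kl > 1.3 * kl_target:
        return kl_coefficient * 1.5
    elif measured_kl < 0.7 * kl_target:
        return kl_coefficient / 1.5
    return kl_coefficient

print(adapt_kl_coefficient(1.0, measured_kl=0.05, kl_target=0.01))   # 1.5   (KL too high)
print(adapt_kl_coefficient(1.0, measured_kl=0.005, kl_target=0.01))  # 0.666 (KL too low)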
64
agents/value_optimization_agent.py
Normal file
@@ -0,0 +1,64 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.agent import *


class ValueOptimizationAgent(Agent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True):
        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
                                           self.replicated_device, self.worker_device)
        self.networks.append(self.main_network)
        self.q_values = Signal("Q")
        self.signals.append(self.q_values)

    # Algorithms for which q_values are calculated from predictions will override this function
    def get_q_values(self, prediction):
        return prediction

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        # convert to batch so we can run it through the network
        observation = np.expand_dims(np.array(curr_state['observation']), 0)
        if self.tp.agent.use_measurements:
            measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
            prediction = self.main_network.online_network.predict([observation, measurements])
        else:
            prediction = self.main_network.online_network.predict(observation)

        actions_q_values = self.get_q_values(prediction)

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(actions_q_values)
        else:
            action = self.evaluation_exploration_policy.get_action(actions_q_values)

        # this is for bootstrapped dqn
        if type(actions_q_values) == list and len(actions_q_values) > 0:
            actions_q_values = actions_q_values[self.exploration_policy.selected_head]
        actions_q_values = actions_q_values.squeeze()

        # store the q values statistics for logging
        self.q_values.add_sample(actions_q_values)

        # store information for plotting interactively (actual plotting is done in agent)
        if self.tp.visualization.plot_action_values_online:
            for idx, action_name in enumerate(self.env.actions_description):
                self.episode_running_info[action_name].append(actions_q_values[idx])

        action_value = {"action_value": actions_q_values[action]}
        return action, action_value