mirror of https://github.com/gryf/coach.git synced 2026-02-02 05:45:45 +01:00

Cleanup imports.

Until now, most of the modules imported all of another module's objects
(variables, classes, functions, and even that module's own imports) into
their namespace, which could cause, and in places did cause, unintentional
use of classes or methods that were only imported indirectly.

With this patch, all star imports are replaced with an import of the
top-level module that provides the desired class or function.
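
For illustration, the substitution follows roughly this pattern (a minimal
sketch using names that appear in the diff below; the comments are added
here for clarity and are not part of the patch):

    # before: a star import pulls every name from utils into this namespace
    from utils import *
    self.loss = Signal('Loss')

    # after: import the top-level module and reference the name through it
    import utils
    self.loss = utils.Signal('Loss')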

In addition, all imports were sorted (where possible) in the order PEP 8 [1]
suggests: standard library imports first, then third-party imports (numpy,
tensorflow, etc.), and finally coach modules. Each of these groups is
separated by a single blank line.

[1] https://www.python.org/dev/peps/pep-0008/#imports
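
A rough sketch of the resulting grouping (based on the agent modules touched
below; the group labels are added here for clarity and are not part of the
patch):

    # standard library
    import collections
    import copy

    # third party
    import numpy as np
    import scipy.signal

    # coach modules
    from agents import policy_optimization_agent as poa
    import logger
    import utils
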
Roman Dobosz
2018-04-12 19:46:32 +02:00
parent cafa152382
commit 1b095aeeca
75 changed files with 1169 additions and 1139 deletions

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,26 +13,48 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import ActorCriticAgent
from agents.agent import Agent
from agents.bc_agent import BCAgent
from agents.bootstrapped_dqn_agent import BootstrappedDQNAgent
from agents.categorical_dqn_agent import CategoricalDQNAgent
from agents.clipped_ppo_agent import ClippedPPOAgent
from agents.ddpg_agent import DDPGAgent
from agents.ddqn_agent import DDQNAgent
from agents.dfp_agent import DFPAgent
from agents.dqn_agent import DQNAgent
from agents.human_agent import HumanAgent
from agents.imitation_agent import ImitationAgent
from agents.mmc_agent import MixedMonteCarloAgent
from agents.n_step_q_agent import NStepQAgent
from agents.naf_agent import NAFAgent
from agents.nec_agent import NECAgent
from agents.pal_agent import PALAgent
from agents.policy_gradients_agent import PolicyGradientsAgent
from agents.policy_optimization_agent import PolicyOptimizationAgent
from agents.ppo_agent import PPOAgent
from agents.qr_dqn_agent import QuantileRegressionDQNAgent
from agents.value_optimization_agent import ValueOptimizationAgent
from agents.actor_critic_agent import *
from agents.agent import *
from agents.bc_agent import *
from agents.bootstrapped_dqn_agent import *
from agents.clipped_ppo_agent import *
from agents.ddpg_agent import *
from agents.ddqn_agent import *
from agents.dfp_agent import *
from agents.dqn_agent import *
from agents.categorical_dqn_agent import *
from agents.human_agent import *
from agents.imitation_agent import *
from agents.mmc_agent import *
from agents.n_step_q_agent import *
from agents.naf_agent import *
from agents.nec_agent import *
from agents.pal_agent import *
from agents.policy_gradients_agent import *
from agents.policy_optimization_agent import *
from agents.ppo_agent import *
from agents.value_optimization_agent import *
from agents.qr_dqn_agent import *
__all__ = [ActorCriticAgent,
Agent,
BCAgent,
BootstrappedDQNAgent,
CategoricalDQNAgent,
ClippedPPOAgent,
DDPGAgent,
DDQNAgent,
DFPAgent,
DQNAgent,
HumanAgent,
ImitationAgent,
MixedMonteCarloAgent,
NAFAgent,
NECAgent,
NStepQAgent,
PALAgent,
PPOAgent,
PolicyGradientsAgent,
PolicyOptimizationAgent,
QuantileRegressionDQNAgent,
ValueOptimizationAgent]

View File

@@ -13,23 +13,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from scipy import signal
from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal
from agents import policy_optimization_agent as poa
import utils
import logger
# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
class ActorCriticAgent(poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
self.last_gradient_update_step_idx = 0
self.action_advantages = Signal('Advantages')
self.state_values = Signal('Values')
self.unclipped_grads = Signal('Grads (unclipped)')
self.value_loss = Signal('Value Loss')
self.policy_loss = Signal('Policy Loss')
self.action_advantages = utils.Signal('Advantages')
self.state_values = utils.Signal('Values')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.value_loss = utils.Signal('Value Loss')
self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.action_advantages)
self.signals.append(self.state_values)
self.signals.append(self.unclipped_grads)
@@ -38,7 +39,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
# Discounting function used to calculate discounted returns.
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def get_general_advantage_estimation_values(self, rewards, values):
# values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... t + n)
@@ -72,20 +73,20 @@ class ActorCriticAgent(PolicyOptimizationAgent):
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
if game_overs[-1]:
R = 0
else:
R = self.main_network.online_network.predict(last_sample(next_states))[0]
R = self.main_network.online_network.predict(utils.last_sample(next_states))[0]
for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0]
bootstrapped_value = self.main_network.online_network.predict(utils.last_sample(next_states))[0]
values = np.append(current_state_values, bootstrapped_value)
if game_overs[-1]:
values[-1] = 0
@@ -94,7 +95,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
action_advantages = action_advantages.squeeze(axis=-1)
if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -113,7 +114,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# TODO: rename curr_state -> state
# convert to batch so we can run it through the network
@@ -126,7 +127,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
# DISCRETE
state_value, action_probabilities = self.main_network.online_network.predict(curr_state)
action_probabilities = action_probabilities.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_probabilities)
else:
action = np.argmax(action_probabilities)
@@ -137,7 +138,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state)
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean

View File

@@ -13,32 +13,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import scipy.ndimage
try:
import matplotlib.pyplot as plt
except:
from logger import failed_imports
failed_imports.append("matplotlib")
import copy
from renderer import Renderer
from configurations import Preset
from collections import deque
from utils import LazyStack
from collections import OrderedDict
from utils import RunPhase, Signal, is_empty, RunningStat
from architectures import *
from exploration_policies import *
from memories import *
from memories.memory import *
from logger import logger, screen
import collections
import random
import time
import os
import itertools
from architectures.tensorflow_components.shared_variables import SharedRunningStats
import logger
try:
import matplotlib.pyplot as plt
except ImportError:
logger.failed_imports.append("matplotlib")
import numpy as np
from pandas.io import pickle
from six.moves import range
import scipy
from architectures.tensorflow_components import shared_variables as sv
import configurations
import exploration_policies as ep
import memories
from memories import memory
import renderer
import utils
class Agent(object):
@@ -54,7 +50,7 @@ class Agent(object):
:param thread_id: int
"""
screen.log_title("Creating agent {}".format(task_id))
logger.screen.log_title("Creating agent {}".format(task_id))
self.task_id = task_id
self.sess = tuning_parameters.sess
self.env = tuning_parameters.env_instance = env
@@ -71,21 +67,20 @@ class Agent(object):
# modules
if tuning_parameters.agent.load_memory_from_file_path:
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
logger.screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
.format(tuning_parameters.agent.load_memory_from_file_path))
self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
self.memory = pickle.read_pickle(tuning_parameters.agent.load_memory_from_file_path)
else:
self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
# self.architecture = eval(tuning_parameters.architecture)
self.memory = eval('memories.' + tuning_parameters.memory + '(tuning_parameters)')
self.has_global = replicated_device is not None
self.replicated_device = replicated_device
self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"
self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
self.exploration_policy = eval('ep.' + tuning_parameters.exploration.policy + '(tuning_parameters)')
self.evaluation_exploration_policy = eval('ep.' + tuning_parameters.exploration.evaluation_policy
+ '(tuning_parameters)')
self.evaluation_exploration_policy.change_phase(RunPhase.TEST)
self.evaluation_exploration_policy.change_phase(utils.RunPhase.TEST)
# initialize all internal variables
self.tp = tuning_parameters
@@ -100,30 +95,30 @@ class Agent(object):
self.episode_running_info = {}
self.last_episode_evaluation_ran = 0
self.running_observations = []
logger.set_current_time(self.current_episode)
logger.logger.set_current_time(self.current_episode)
self.main_network = None
self.networks = []
self.last_episode_images = []
self.renderer = Renderer()
self.renderer = renderer.Renderer()
# signals
self.signals = []
self.loss = Signal('Loss')
self.loss = utils.Signal('Loss')
self.signals.append(self.loss)
self.curr_learning_rate = Signal('Learning Rate')
self.curr_learning_rate = utils.Signal('Learning Rate')
self.signals.append(self.curr_learning_rate)
if self.tp.env.normalize_observation and not self.env.is_state_type_image:
if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
self.running_reward_stats = RunningStat(())
self.running_observation_stats = utils.RunningStat((self.tp.env.desired_observation_width,))
self.running_reward_stats = utils.RunningStat(())
else:
self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
shape=(self.tp.env.desired_observation_width,),
name='observation_stats')
self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
shape=(),
name='reward_stats')
self.running_observation_stats = sv.SharedRunningStats(self.tp, replicated_device,
shape=(self.tp.env.desired_observation_width,),
name='observation_stats')
self.running_reward_stats = sv.SharedRunningStats(self.tp, replicated_device,
shape=(),
name='reward_stats')
# env is already reset at this point. Otherwise we're getting an error where you cannot
# reset an env which is not done
@@ -137,13 +132,13 @@ class Agent(object):
def log_to_screen(self, phase):
# log to screen
if self.current_episode >= 0:
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
exploration = self.exploration_policy.get_control_param()
else:
exploration = self.evaluation_exploration_policy.get_control_param()
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
@@ -154,37 +149,37 @@ class Agent(object):
prefix=phase
)
def update_log(self, phase=RunPhase.TRAIN):
def update_log(self, phase=utils.RunPhase.TRAIN):
"""
Writes logging messages to screen and updates the log file with all the signal values.
:return: None
"""
# log all the signals to file
logger.set_current_time(self.current_episode)
logger.create_signal_value('Training Iter', self.training_iteration)
logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
logger.create_signal_value('ER #Episodes', self.memory.length())
logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
logger.create_signal_value('Total steps', self.total_steps_counter)
logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
if phase == RunPhase.TRAIN else np.nan)
logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
if phase == RunPhase.TEST else np.nan)
logger.create_signal_value('Update Target Network', 0, overwrite=False)
logger.update_wall_clock_time(self.current_episode)
logger.logger.set_current_time(self.current_episode)
logger.logger.create_signal_value('Training Iter', self.training_iteration)
logger.logger.create_signal_value('In Heatup', int(phase == utils.RunPhase.HEATUP))
logger.logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
logger.logger.create_signal_value('ER #Episodes', self.memory.length())
logger.logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
logger.logger.create_signal_value('Total steps', self.total_steps_counter)
logger.logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
logger.logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
if phase == utils.RunPhase.TRAIN else np.nan)
logger.logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
if phase == utils.RunPhase.TEST else np.nan)
logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
logger.logger.update_wall_clock_time(self.current_episode)
for signal in self.signals:
logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
logger.logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
logger.logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
logger.logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
logger.logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
# dump
if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
and self.current_episode > 0:
logger.dump_output_csv()
logger.logger.dump_output_csv()
def reset_game(self, do_not_reset_env=False):
"""
@@ -211,7 +206,7 @@ class Agent(object):
self.episode_running_info[action] = []
plt.clf()
if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
if self.tp.agent.middleware_type == configurations.MiddlewareTypes.LSTM:
for network in self.networks:
network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init
@@ -281,9 +276,9 @@ class Agent(object):
if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
for network in self.networks:
network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
logger.create_signal_value('Update Target Network', 1)
logger.logger.create_signal_value('Update Target Network', 1)
else:
logger.create_signal_value('Update Target Network', 0, overwrite=False)
logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
return loss
@@ -321,7 +316,7 @@ class Agent(object):
plt.legend()
plt.pause(0.00000001)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
"""
choose an action to act with in the current episode being played. Different behavior might be exhibited when training
or testing.
@@ -351,15 +346,15 @@ class Agent(object):
for input_name in self.tp.agent.input_types.keys():
input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0)
return input_state
def prepare_initial_state(self):
"""
Create an initial state when starting a new episode
:return: None
"""
observation = self.preprocess_observation(self.env.state['observation'])
self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
observation = LazyStack(self.curr_stack, -1)
self.curr_stack = collections.deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
observation = utils.LazyStack(self.curr_stack, -1)
self.curr_state = {
'observation': observation
@@ -369,21 +364,21 @@ class Agent(object):
if self.tp.agent.use_accumulated_reward_as_measurement:
self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
def act(self, phase=RunPhase.TRAIN):
def act(self, phase=utils.RunPhase.TRAIN):
"""
Take one step in the environment according to the network prediction and store the transition in memory
:param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
:return: A boolean value that signals an episode termination
"""
if phase != RunPhase.TEST:
if phase != utils.RunPhase.TEST:
self.total_steps_counter += 1
self.current_episode_steps_counter += 1
# get new action
action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0}
if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
if phase == utils.RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
action = self.env.get_random_action()
else:
action, action_info = self.choose_action(self.curr_state, phase=phase)
@@ -402,13 +397,13 @@ class Agent(object):
next_state['observation'] = self.preprocess_observation(next_state['observation'])
# plot action values online
if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
if self.tp.visualization.plot_action_values_online and phase != utils.RunPhase.HEATUP:
self.plot_action_values_online()
# initialize the next state
# TODO: provide option to stack more than just the observation
self.curr_stack.append(next_state['observation'])
observation = LazyStack(self.curr_stack, -1)
observation = utils.LazyStack(self.curr_stack, -1)
next_state['observation'] = observation
if self.tp.agent.use_measurements and 'measurements' in result.keys():
@@ -417,14 +412,14 @@ class Agent(object):
next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
# store the transition only if we are training
if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
transition = memory.Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
for key in action_info.keys():
transition.info[key] = action_info[key]
if self.tp.agent.add_a_normalized_timestep_to_the_observation:
transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
self.memory.store(transition)
elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
elif phase == utils.RunPhase.TEST and self.tp.visualization.dump_gifs:
# we store the transitions only for saving gifs
self.last_episode_images.append(self.env.get_rendered_image())
@@ -437,7 +432,7 @@ class Agent(object):
self.update_log(phase=phase)
self.log_to_screen(phase=phase)
if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
self.reset_game()
self.current_episode += 1
@@ -456,8 +451,8 @@ class Agent(object):
max_reward_achieved = -float('inf')
average_evaluation_reward = 0
screen.log_title("Running evaluation")
self.env.change_phase(RunPhase.TEST)
logger.screen.log_title("Running evaluation")
self.env.change_phase(utils.RunPhase.TEST)
for i in range(num_episodes):
# keep the online network in sync with the global network
if keep_networks_synced:
@@ -466,7 +461,7 @@ class Agent(object):
episode_ended = False
while not episode_ended:
episode_ended = self.act(phase=RunPhase.TEST)
episode_ended = self.act(phase=utils.RunPhase.TEST)
if keep_networks_synced \
and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps:
@@ -477,7 +472,7 @@ class Agent(object):
max_reward_achieved = self.total_reward_in_current_episode
frame_skipping = int(5/self.tp.env.frame_skip)
if self.tp.visualization.dump_gifs:
logger.create_gif(self.last_episode_images[::frame_skipping],
logger.logger.create_gif(self.last_episode_images[::frame_skipping],
name='score-{}'.format(max_reward_achieved), fps=10)
average_evaluation_reward += self.total_reward_in_current_episode
@@ -485,8 +480,8 @@ class Agent(object):
average_evaluation_reward /= float(num_episodes)
self.env.change_phase(RunPhase.TRAIN)
screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
self.env.change_phase(utils.RunPhase.TRAIN)
logger.screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
def post_training_commands(self):
pass
@@ -505,15 +500,15 @@ class Agent(object):
# heatup phase
if self.tp.num_heatup_steps != 0:
self.in_heatup = True
screen.log_title("Starting heatup {}".format(self.task_id))
logger.screen.log_title("Starting heatup {}".format(self.task_id))
num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
self.act(phase=RunPhase.HEATUP)
self.act(phase=utils.RunPhase.HEATUP)
# training phase
self.in_heatup = False
screen.log_title("Starting training {}".format(self.task_id))
self.exploration_policy.change_phase(RunPhase.TRAIN)
logger.screen.log_title("Starting training {}".format(self.task_id))
self.exploration_policy.change_phase(utils.RunPhase.TRAIN)
training_start_time = time.time()
model_snapshots_periods_passed = -1
self.reset_game()
@@ -557,7 +552,7 @@ class Agent(object):
self.loss.add_sample(loss)
self.training_iteration += 1
if self.imitation:
self.log_to_screen(RunPhase.TRAIN)
self.log_to_screen(utils.RunPhase.TRAIN)
self.post_training_commands()
def save_model(self, model_id):

View File

@@ -13,16 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.imitation_agent import ImitationAgent
from agents import imitation_agent
# Behavioral Cloning Agent
class BCAgent(ImitationAgent):
class BCAgent(imitation_agent.ImitationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
imitation_agent.ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def learn_from_batch(self, batch):
current_states, _, actions, _, _, _ = self.extract_batch(batch)

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,17 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
import utils
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
class BootstrappedDQNAgent(ValueOptimizationAgent):
class BootstrappedDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def reset_game(self, do_not_reset_env=False):
ValueOptimizationAgent.reset_game(self, do_not_reset_env)
voa.ValueOptimizationAgent.reset_game(self, do_not_reset_env)
self.exploration_policy.select_head()
def learn_from_batch(self, batch):
@@ -51,8 +52,8 @@ class BootstrappedDQNAgent(ValueOptimizationAgent):
return total_loss
def act(self, phase=RunPhase.TRAIN):
ValueOptimizationAgent.act(self, phase)
def act(self, phase=utils.RunPhase.TRAIN):
voa.ValueOptimizationAgent.act(self, phase)
mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability,
self.tp.exploration.architecture_num_q_heads)
self.memory.update_last_transition_info({'mask': mask})

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class CategoricalDQNAgent(ValueOptimizationAgent):
class CategoricalDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
# prediction's format is (batch,actions,atoms)
@@ -57,4 +58,3 @@ class CategoricalDQNAgent(ValueOptimizationAgent):
total_loss = result[0]
return total_loss

View File

@@ -13,27 +13,34 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import *
import collections
import copy
from random import shuffle
import numpy as np
from agents import actor_critic_agent as aca
from agents import policy_optimization_agent as poa
import logger
import utils
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
class ClippedPPOAgent(ActorCriticAgent):
class ClippedPPOAgent(aca.ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
# signals definition
self.value_loss = Signal('Value Loss')
self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.value_loss)
self.policy_loss = Signal('Policy Loss')
self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.policy_loss)
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = Signal('Grads (unclipped)')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)
self.value_targets = Signal('Value Targets')
self.value_targets = utils.Signal('Value Targets')
self.signals.append(self.value_targets)
self.kl_divergence = Signal('KL Divergence')
self.kl_divergence = utils.Signal('KL Divergence')
self.signals.append(self.kl_divergence)
def fill_advantages(self, batch):
@@ -46,9 +53,9 @@ class ClippedPPOAgent(ActorCriticAgent):
# calculate advantages
advantages = []
value_targets = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
@@ -66,7 +73,7 @@ class ClippedPPOAgent(ActorCriticAgent):
advantages = np.append(advantages, rollout_advantages)
value_targets = np.append(value_targets, gae_based_value_targets)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
@@ -144,8 +151,8 @@ class ClippedPPOAgent(ActorCriticAgent):
curr_learning_rate = self.tp.learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
@@ -184,13 +191,13 @@ class ClippedPPOAgent(ActorCriticAgent):
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(losses[0], losses[1])
def choose_action(self, current_state, phase=RunPhase.TRAIN):
def choose_action(self, current_state, phase=utils.RunPhase.TRAIN):
if self.env.discrete_controls:
# DISCRETE
_, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state))
action_values = action_values.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -201,7 +208,7 @@ class ClippedPPOAgent(ActorCriticAgent):
_, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state))
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
# if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5:
# print action

View File

@@ -13,28 +13,34 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
from agents.actor_critic_agent import *
from configurations import *
import numpy as np
from agents import actor_critic_agent as aca
from agents import agent
from architectures import network_wrapper as nw
import configurations as conf
import utils
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
class DDPGAgent(ActorCriticAgent):
class DDPGAgent(aca.ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
# define critic network
self.critic_network = self.main_network
# self.networks.append(self.critic_network)
# define actor network
tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
tuning_parameters.agent.output_types = [OutputTypes.Pi]
self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
self.replicated_device, self.worker_device)
tuning_parameters.agent.input_types = {'observation': conf.InputTypes.Observation}
tuning_parameters.agent.output_types = [conf.OutputTypes.Pi]
self.actor_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
self.replicated_device, self.worker_device)
self.networks.append(self.actor_network)
self.q_values = Signal("Q")
self.q_values = utils.Signal("Q")
self.signals.append(self.q_values)
self.reset_game(do_not_reset_env=True)
@@ -82,14 +88,14 @@ class DDPGAgent(ActorCriticAgent):
return total_loss
def train(self):
return Agent.train(self)
return agent.Agent.train(self)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
assert not self.env.discrete_controls, 'DDPG works only for continuous control problems'
result = self.actor_network.online_network.predict(self.tf_input_state(curr_state))
action_values = result[0].squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Double DQN - https://arxiv.org/abs/1509.06461
class DDQNAgent(ValueOptimizationAgent):
class DDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

View File

@@ -13,17 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.agent import *
from agents import agent
from architectures import network_wrapper as nw
import utils
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
class DFPAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.current_goal = self.tp.agent.goal_vector
self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
def learn_from_batch(self, batch):
@@ -45,7 +48,7 @@ class DFPAgent(Agent):
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
observation = np.expand_dims(np.array(curr_state['observation']), 0)
measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
@@ -66,7 +69,7 @@ class DFPAgent(Agent):
self.tp.agent.future_measurements_weights)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class DistributionalDQNAgent(ValueOptimizationAgent):
class DistributionalDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
# prediction's format is (batch,actions,atoms)
@@ -57,4 +58,3 @@ class DistributionalDQNAgent(ValueOptimizationAgent):
total_loss = result[0]
return total_loss

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
class DQNAgent(ValueOptimizationAgent):
class DQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

View File

@@ -13,31 +13,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
import os
from agents.agent import *
import pygame
from pandas.io import pickle
from agents import agent
import logger
import utils
class HumanAgent(Agent):
class HumanAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.clock = pygame.time.Clock()
self.max_fps = int(self.tp.visualization.max_fps_for_human_control)
screen.log_title("Human Control Mode")
utils.screen.log_title("Human Control Mode")
available_keys = self.env.get_available_keys()
if available_keys:
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
screen.log("")
utils.screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
utils.screen.log("")
for action, key in self.env.get_available_keys():
screen.log("\t- {}: {}".format(action, key))
screen.separator()
utils.screen.log("\t- {}: {}".format(action, key))
utils.screen.separator()
def train(self):
return 0
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
action = self.env.get_action_from_user()
# keep constant fps
@@ -49,16 +55,16 @@ class HumanAgent(Agent):
return action, {"action_value": 0}
def save_replay_buffer_and_exit(self):
replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p')
replay_buffer_path = os.path.join(logger.logger.experiments_path, 'replay_buffer.p')
self.memory.tp = None
to_pickle(self.memory, replay_buffer_path)
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
pickle.to_pickle(self.memory, replay_buffer_path)
utils.screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
exit()
def log_to_screen(self, phase):
# log to screen
screen.log_dict(
OrderedDict([
# log to utils.screen
utils.screen.log_dict(
collections.OrderedDict([
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
("steps", self.total_steps_counter)

View File

@@ -13,23 +13,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
from agents.agent import *
from agents import agent
from architectures import network_wrapper as nw
import utils
import logging
# Imitation Agent
class ImitationAgent(Agent):
class ImitationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.imitation = True
def extract_action_values(self, prediction):
return prediction.squeeze()
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state))
@@ -49,10 +53,10 @@ class ImitationAgent(Agent):
def log_to_screen(self, phase):
# log to screen
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
# for the training phase - we log during the episode to visualize the progress in training
screen.log_dict(
OrderedDict([
logging.screen.log_dict(
collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("Loss", self.loss.values[-1]),
@@ -62,4 +66,4 @@ class ImitationAgent(Agent):
)
else:
# for the evaluation phase - logging as in regular RL
Agent.log_to_screen(self, phase)
agent.Agent.log_to_screen(self, phase)

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,13 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
class MixedMonteCarloAgent(ValueOptimizationAgent):
class MixedMonteCarloAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate
def learn_from_batch(self, batch):

View File

@@ -14,22 +14,21 @@
# limitations under the License.
#
import numpy as np
import scipy.signal
from agents.value_optimization_agent import ValueOptimizationAgent
from agents.policy_optimization_agent import PolicyOptimizationAgent
from logger import logger
from utils import Signal, last_sample
from agents import value_optimization_agent as voa
from agents import policy_optimization_agent as poa
import logger
import utils
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
class NStepQAgent(voa.ValueOptimizationAgent, poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
self.last_gradient_update_step_idx = 0
self.q_values = Signal('Q Values')
self.unclipped_grads = Signal('Grads (unclipped)')
self.value_loss = Signal('Value Loss')
self.q_values = utils.Signal('Q Values')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.q_values)
self.signals.append(self.unclipped_grads)
self.signals.append(self.value_loss)
@@ -57,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
if game_overs[-1]:
R = 0
else:
R = np.max(self.main_network.target_network.predict(last_sample(next_states)))
R = np.max(self.main_network.target_network.predict(utils.last_sample(next_states)))
for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
@@ -85,4 +84,4 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
else:
logger.create_signal_value('Update Target Network', 0, overwrite=False)
return PolicyOptimizationAgent.train(self)
return poa.PolicyOptimizationAgent.train(self)

View File

@@ -13,21 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import ValueOptimizationAgent
from utils import RunPhase, Signal
import utils
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.l_values = Signal("L")
self.a_values = Signal("Advantage")
self.mu_values = Signal("Action")
self.v_values = Signal("V")
self.l_values = utils.Signal("L")
self.a_values = utils.Signal("Advantage")
self.mu_values = utils.Signal("Action")
self.v_values = utils.Signal("V")
self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]
def learn_from_batch(self, batch):
@@ -49,7 +48,7 @@ class NAFAgent(ValueOptimizationAgent):
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
assert not self.env.discrete_controls, 'NAF works only for continuous control problems'
# convert to batch so we can run it through the network
@@ -60,7 +59,7 @@ class NAFAgent(ValueOptimizationAgent):
outputs=naf_head.mu,
squeeze_output=False,
)
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values

View File

@@ -13,19 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import ValueOptimizationAgent
from agents import value_optimization_agent as voa
from logger import screen
from utils import RunPhase
import utils
# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
class NECAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
self.current_episode_state_embeddings = []
self.training_started = False
@@ -52,7 +49,7 @@ class NECAgent(ValueOptimizationAgent):
return total_loss
def act(self, phase=RunPhase.TRAIN):
def act(self, phase=utils.RunPhase.TRAIN):
if self.in_heatup:
# get embedding in heatup (otherwise we get it through choose_action)
embedding = self.main_network.online_network.predict(

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
class PALAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.alpha = tuning_parameters.agent.pal_alpha
self.persistent = tuning_parameters.agent.persistent_advantage_learning
self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

View File

@@ -13,25 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.policy_optimization_agent import *
import numpy as np
from logger import *
import tensorflow as tf
try:
import matplotlib.pyplot as plt
except:
from logger import failed_imports
failed_imports.append("matplotlib")
from utils import *
from agents import policy_optimization_agent as poa
import logger
import utils
class PolicyGradientsAgent(PolicyOptimizationAgent):
class PolicyGradientsAgent(poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.returns_mean = Signal('Returns Mean')
self.returns_variance = Signal('Returns Variance')
poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.returns_mean = utils.Signal('Returns Mean')
self.returns_variance = utils.Signal('Returns Variance')
self.signals.append(self.returns_mean)
self.signals.append(self.returns_variance)
self.last_gradient_update_step_idx = 0
@@ -41,21 +34,21 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)
for i in reversed(range(len(total_returns))):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
targets = total_returns
if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -69,12 +62,12 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
return total_loss
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
if self.env.discrete_controls:
# DISCRETE
action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -84,7 +77,7 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
# CONTINUOUS
result = self.main_network.online_network.predict(self.tf_input_state(curr_state))
action_values = result[0].squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values

View File

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,12 +13,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
from agents.agent import *
from memories.memory import Episode
import numpy as np
from agents import agent
from architectures import network_wrapper as nw
import logger
import utils
class PolicyGradientRescaler(Enum):
class PolicyGradientRescaler(utils.Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
@@ -30,11 +35,11 @@ class PolicyGradientRescaler(Enum):
GAE = 8
class PolicyOptimizationAgent(Agent):
class PolicyOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler)
@@ -44,7 +49,7 @@ class PolicyOptimizationAgent(Agent):
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = Signal('Entropy')
self.entropy = utils.Signal('Entropy')
self.signals.append(self.entropy)
self.reset_game(do_not_reset_env=True)
@@ -52,8 +57,8 @@ class PolicyOptimizationAgent(Agent):
def log_to_screen(self, phase):
# log to screen
if self.current_episode > 0:
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),

View File

@@ -13,36 +13,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
import copy
from agents.actor_critic_agent import *
from random import shuffle
import numpy as np
from agents import actor_critic_agent as aca
from agents import policy_optimization_agent as poa
from architectures import network_wrapper as nw
import configurations
import logger
import utils
# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
class PPOAgent(aca.ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
self.critic_network = self.main_network
# define the policy network
tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
tuning_parameters.agent.output_types = [OutputTypes.PPO]
tuning_parameters.agent.input_types = {'observation': configurations.InputTypes.Observation}
tuning_parameters.agent.output_types = [configurations.OutputTypes.PPO]
tuning_parameters.agent.optimizer_type = 'Adam'
tuning_parameters.agent.l2_regularization = 0
self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.policy_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.networks.append(self.policy_network)
# signals definition
self.value_loss = Signal('Value Loss')
self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.value_loss)
self.policy_loss = Signal('Policy Loss')
self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.policy_loss)
self.kl_divergence = Signal('KL Divergence')
self.kl_divergence = utils.Signal('KL Divergence')
self.signals.append(self.kl_divergence)
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = Signal('Grads (unclipped)')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)
self.reset_game(do_not_reset_env=True)
@@ -57,9 +65,9 @@ class PPOAgent(ActorCriticAgent):
# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
@@ -76,7 +84,7 @@ class PPOAgent(ActorCriticAgent):
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
@@ -107,7 +115,7 @@ class PPOAgent(ActorCriticAgent):
for k, v in current_states.items()
}
total_return_batch = total_return[i * batch_size:(i + 1) * batch_size]
old_policy_values = force_list(self.critic_network.target_network.predict(
old_policy_values = utils.force_list(self.critic_network.target_network.predict(
current_states_batch).squeeze())
if self.critic_network.online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
@@ -155,7 +163,7 @@ class PPOAgent(ActorCriticAgent):
actions = np.expand_dims(actions, -1)
# get old policy probabilities and distribution
old_policy = force_list(self.policy_network.target_network.predict(current_states))
old_policy = utils.force_list(self.policy_network.target_network.predict(current_states))
# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.policy_network.online_network.output_heads[0].kl_divergence,
@@ -196,8 +204,8 @@ class PPOAgent(ActorCriticAgent):
curr_learning_rate = self.tp.learning_rate
# log training parameters
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
@@ -215,7 +223,7 @@ class PPOAgent(ActorCriticAgent):
def update_kl_coefficient(self):
# John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
# his implementation for now because we know it works well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
logger.screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
# update kl coefficient
kl_target = self.tp.agent.target_kl_divergence
@@ -236,7 +244,7 @@ class PPOAgent(ActorCriticAgent):
new_kl_coefficient,
self.policy_network.online_network.output_heads[0].kl_coefficient_ph)
screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
logger.screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
def post_training_commands(self):
if self.tp.agent.use_kl_regularization:
@@ -264,12 +272,12 @@ class PPOAgent(ActorCriticAgent):
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
if self.env.discrete_controls:
# DISCRETE
action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -280,7 +288,7 @@ class PPOAgent(ActorCriticAgent):
action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state))
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean
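The update_kl_coefficient hunks above nudge the KL penalty weight toward the target divergence set in the tuning parameters. A minimal sketch of the adaptive-penalty rule from the PPO paper (the threshold and change factor are the paper's commonly used defaults and are assumptions here, not values read from Coach's code):

def adapt_kl_coefficient(kl_coefficient, measured_kl, kl_target,
                         tolerance=1.5, change_factor=2.0):
    # Tighten the penalty when the policy moved too far, relax it when it barely moved.
    if measured_kl > tolerance * kl_target:
        return kl_coefficient * change_factor
    if measured_kl < kl_target / tolerance:
        return kl_coefficient / change_factor
    return kl_coefficient

# Example: the measured KL overshoots the target, so the coefficient doubles.
print(adapt_kl_coefficient(1.0, measured_kl=0.03, kl_target=0.01))  # -> 2.0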


@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa
# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
class QuantileRegressionDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms)
# prediction's format is (batch,actions,atoms)
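The quantile_probabilities line above gives every atom an equal weight. Taking the comment at face value (predictions shaped (batch, actions, atoms)), per-action Q-values are the probability-weighted sum over the atom axis; a self-contained sketch with made-up numbers in place of real network output:

import numpy as np

atoms = 4
quantile_probabilities = np.ones(atoms) / float(atoms)

# Stand-in for a network prediction over a batch of 2 states and 3 actions.
prediction = np.random.randn(2, 3, atoms)

# Q(s, a) is the expectation over the quantile atoms: a weighted sum along the last axis.
q_values = np.dot(prediction, quantile_probabilities)  # shape (2, 3)
greedy_actions = np.argmax(q_values, axis=1)
print(q_values.shape, greedy_actions)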


@@ -13,21 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from agents.agent import Agent
from architectures.network_wrapper import NetworkWrapper
from utils import RunPhase, Signal
from agents import agent
from architectures import network_wrapper as nw
import utils
class ValueOptimizationAgent(Agent):
class ValueOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.q_values = Signal("Q")
self.q_values = utils.Signal("Q")
self.signals.append(self.q_values)
self.reset_game(do_not_reset_env=True)
@@ -47,12 +46,12 @@ class ValueOptimizationAgent(Agent):
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
prediction = self.get_prediction(curr_state)
actions_q_values = self.get_q_values(prediction)
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
exploration_policy = self.exploration_policy
else:
exploration_policy = self.evaluation_exploration_policy
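The hunk above only chooses which exploration policy object to consult; Coach's actual policy classes live elsewhere in the repository. As a hypothetical stand-in (names and behaviour are illustrative, not Coach's API), an epsilon-greedy rule applied to the Q-values would look roughly like this:

import numpy as np

def epsilon_greedy(q_values, epsilon=0.1, training=True):
    # Explore with probability epsilon while training, otherwise act greedily.
    if training and np.random.rand() < epsilon:
        return np.random.randint(len(q_values))
    return int(np.argmax(q_values))

# During evaluation the exploration noise is switched off, mirroring the RunPhase check above.
print(epsilon_greedy(np.array([0.1, 0.5, 0.2]), training=False))  # -> 1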