Mirror of https://github.com/gryf/coach.git, synced 2026-02-02 05:45:45 +01:00
Cleanup imports.
Until now, most modules imported all of another module's objects (variables, classes, functions, and even that module's own imports) into their namespace, which could lead to, and in places did lead to, unintentional use of classes or methods that were only imported indirectly. With this patch, all star imports are replaced by imports of the top-level module that provides the desired class or function. In addition, imports are sorted (where possible) as PEP 8 [1] suggests: standard-library imports first, then third-party imports (numpy, tensorflow, etc.), and finally coach modules, with the groups separated by a blank line.

[1] https://www.python.org/dev/peps/pep-0008/#imports
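For illustration only, here is the shape of the change on a hypothetical module (the file and class below are made up; utils, logger, agents.agent and the call patterns are the real coach modules as they appear in the diff):

# my_agent.py -- illustrative sketch, not a file in this repository.
# Before: star imports such as "from agents.agent import *" and "from utils import *"
# put every name into this namespace, so Signal or screen could resolve to an
# object that was only imported indirectly.
# After: imports are grouped per PEP 8 (standard library, third party, coach
# modules), separated by blank lines, and only the providing module is imported.

import collections

import numpy as np

from agents import agent
import logger
import utils


class MyAgent(agent.Agent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        # Every call site now names the module that provides the symbol.
        self.loss = utils.Signal('Loss')
        self.best_reward = -np.inf
        logger.screen.log_dict(collections.OrderedDict([("Agent", "MyAgent")]),
                               prefix="init")

Spelling out the providing module keeps each symbol's origin visible and prevents one star import from silently shadowing a name pulled in by another.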
@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,26 +13,48 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from agents.actor_critic_agent import ActorCriticAgent
from agents.agent import Agent
from agents.bc_agent import BCAgent
from agents.bootstrapped_dqn_agent import BootstrappedDQNAgent
from agents.categorical_dqn_agent import CategoricalDQNAgent
from agents.clipped_ppo_agent import ClippedPPOAgent
from agents.ddpg_agent import DDPGAgent
from agents.ddqn_agent import DDQNAgent
from agents.dfp_agent import DFPAgent
from agents.dqn_agent import DQNAgent
from agents.human_agent import HumanAgent
from agents.imitation_agent import ImitationAgent
from agents.mmc_agent import MixedMonteCarloAgent
from agents.n_step_q_agent import NStepQAgent
from agents.naf_agent import NAFAgent
from agents.nec_agent import NECAgent
from agents.pal_agent import PALAgent
from agents.policy_gradients_agent import PolicyGradientsAgent
from agents.policy_optimization_agent import PolicyOptimizationAgent
from agents.ppo_agent import PPOAgent
from agents.qr_dqn_agent import QuantileRegressionDQNAgent
from agents.value_optimization_agent import ValueOptimizationAgent

from agents.actor_critic_agent import *
from agents.agent import *
from agents.bc_agent import *
from agents.bootstrapped_dqn_agent import *
from agents.clipped_ppo_agent import *
from agents.ddpg_agent import *
from agents.ddqn_agent import *
from agents.dfp_agent import *
from agents.dqn_agent import *
from agents.categorical_dqn_agent import *
from agents.human_agent import *
from agents.imitation_agent import *
from agents.mmc_agent import *
from agents.n_step_q_agent import *
from agents.naf_agent import *
from agents.nec_agent import *
from agents.pal_agent import *
from agents.policy_gradients_agent import *
from agents.policy_optimization_agent import *
from agents.ppo_agent import *
from agents.value_optimization_agent import *
from agents.qr_dqn_agent import *
__all__ = [ActorCriticAgent,
Agent,
BCAgent,
BootstrappedDQNAgent,
CategoricalDQNAgent,
ClippedPPOAgent,
DDPGAgent,
DDQNAgent,
DFPAgent,
DQNAgent,
HumanAgent,
ImitationAgent,
MixedMonteCarloAgent,
NAFAgent,
NECAgent,
NStepQAgent,
PALAgent,
PPOAgent,
PolicyGradientsAgent,
PolicyOptimizationAgent,
QuantileRegressionDQNAgent,
ValueOptimizationAgent]

@@ -13,23 +13,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np
from scipy import signal

from agents.policy_optimization_agent import *
from logger import *
from utils import *
import scipy.signal
from agents import policy_optimization_agent as poa
import utils
import logger


# Actor Critic - https://arxiv.org/abs/1602.01783
class ActorCriticAgent(PolicyOptimizationAgent):
class ActorCriticAgent(poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
self.last_gradient_update_step_idx = 0
self.action_advantages = Signal('Advantages')
self.state_values = Signal('Values')
self.unclipped_grads = Signal('Grads (unclipped)')
self.value_loss = Signal('Value Loss')
self.policy_loss = Signal('Policy Loss')
self.action_advantages = utils.Signal('Advantages')
self.state_values = utils.Signal('Values')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.value_loss = utils.Signal('Value Loss')
self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.action_advantages)
self.signals.append(self.state_values)
self.signals.append(self.unclipped_grads)
@@ -38,7 +39,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):

# Discounting function used to calculate discounted returns.
def discount(self, x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

def get_general_advantage_estimation_values(self, rewards, values):
# values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... t + n)
@@ -72,20 +73,20 @@ class ActorCriticAgent(PolicyOptimizationAgent):
# estimate the advantage function
action_advantages = np.zeros((num_transitions, 1))

if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
if game_overs[-1]:
R = 0
else:
R = self.main_network.online_network.predict(last_sample(next_states))[0]
R = self.main_network.online_network.predict(utils.last_sample(next_states))[0]

for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
state_value_head_targets[i] = R
action_advantages[i] = R - current_state_values[i]

elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0]
bootstrapped_value = self.main_network.online_network.predict(utils.last_sample(next_states))[0]
values = np.append(current_state_values, bootstrapped_value)
if game_overs[-1]:
values[-1] = 0
@@ -94,7 +95,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values)
action_advantages = np.vstack(gae_values)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")

action_advantages = action_advantages.squeeze(axis=-1)
if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -113,7 +114,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):

return total_loss

def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# TODO: rename curr_state -> state

# convert to batch so we can run it through the network
@@ -126,7 +127,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
# DISCRETE
state_value, action_probabilities = self.main_network.online_network.predict(curr_state)
action_probabilities = action_probabilities.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_probabilities)
else:
action = np.argmax(action_probabilities)
@@ -137,7 +138,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state)
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean

181 agents/agent.py
@@ -13,32 +13,28 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import scipy.ndimage
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
except:
|
||||
from logger import failed_imports
|
||||
failed_imports.append("matplotlib")
|
||||
|
||||
import copy
|
||||
from renderer import Renderer
|
||||
from configurations import Preset
|
||||
from collections import deque
|
||||
from utils import LazyStack
|
||||
from collections import OrderedDict
|
||||
from utils import RunPhase, Signal, is_empty, RunningStat
|
||||
from architectures import *
|
||||
from exploration_policies import *
|
||||
from memories import *
|
||||
from memories.memory import *
|
||||
from logger import logger, screen
|
||||
import collections
|
||||
import random
|
||||
import time
|
||||
import os
|
||||
import itertools
|
||||
from architectures.tensorflow_components.shared_variables import SharedRunningStats
|
||||
|
||||
import logger
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
except ImportError:
|
||||
logger.failed_imports.append("matplotlib")
|
||||
|
||||
import numpy as np
|
||||
from pandas.io import pickle
|
||||
from six.moves import range
|
||||
import scipy
|
||||
|
||||
from architectures.tensorflow_components import shared_variables as sv
|
||||
import configurations
|
||||
import exploration_policies as ep
|
||||
import memories
|
||||
from memories import memory
|
||||
import renderer
|
||||
import utils
|
||||
|
||||
|
||||
class Agent(object):
|
||||
@@ -54,7 +50,7 @@ class Agent(object):
|
||||
:param thread_id: int
|
||||
"""
|
||||
|
||||
screen.log_title("Creating agent {}".format(task_id))
|
||||
logger.screen.log_title("Creating agent {}".format(task_id))
|
||||
self.task_id = task_id
|
||||
self.sess = tuning_parameters.sess
|
||||
self.env = tuning_parameters.env_instance = env
|
||||
@@ -71,21 +67,20 @@ class Agent(object):
|
||||
|
||||
# modules
|
||||
if tuning_parameters.agent.load_memory_from_file_path:
|
||||
screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
|
||||
logger.screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
|
||||
.format(tuning_parameters.agent.load_memory_from_file_path))
|
||||
self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
|
||||
self.memory = pickle.read_pickle(tuning_parameters.agent.load_memory_from_file_path)
|
||||
else:
|
||||
self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
|
||||
# self.architecture = eval(tuning_parameters.architecture)
|
||||
self.memory = eval('memories.' + tuning_parameters.memory + '(tuning_parameters)')
|
||||
|
||||
self.has_global = replicated_device is not None
|
||||
self.replicated_device = replicated_device
|
||||
self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"
|
||||
|
||||
self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
|
||||
self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
|
||||
self.exploration_policy = eval('ep.' + tuning_parameters.exploration.policy + '(tuning_parameters)')
|
||||
self.evaluation_exploration_policy = eval('ep.' + tuning_parameters.exploration.evaluation_policy
|
||||
+ '(tuning_parameters)')
|
||||
self.evaluation_exploration_policy.change_phase(RunPhase.TEST)
|
||||
self.evaluation_exploration_policy.change_phase(utils.RunPhase.TEST)
|
||||
|
||||
# initialize all internal variables
|
||||
self.tp = tuning_parameters
|
||||
@@ -100,30 +95,30 @@ class Agent(object):
|
||||
self.episode_running_info = {}
|
||||
self.last_episode_evaluation_ran = 0
|
||||
self.running_observations = []
|
||||
logger.set_current_time(self.current_episode)
|
||||
logger.logger.set_current_time(self.current_episode)
|
||||
self.main_network = None
|
||||
self.networks = []
|
||||
self.last_episode_images = []
|
||||
self.renderer = Renderer()
|
||||
self.renderer = renderer.Renderer()
|
||||
|
||||
# signals
|
||||
self.signals = []
|
||||
self.loss = Signal('Loss')
|
||||
self.loss = utils.Signal('Loss')
|
||||
self.signals.append(self.loss)
|
||||
self.curr_learning_rate = Signal('Learning Rate')
|
||||
self.curr_learning_rate = utils.Signal('Learning Rate')
|
||||
self.signals.append(self.curr_learning_rate)
|
||||
|
||||
if self.tp.env.normalize_observation and not self.env.is_state_type_image:
|
||||
if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
|
||||
self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
|
||||
self.running_reward_stats = RunningStat(())
|
||||
self.running_observation_stats = utils.RunningStat((self.tp.env.desired_observation_width,))
|
||||
self.running_reward_stats = utils.RunningStat(())
|
||||
else:
|
||||
self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
|
||||
shape=(self.tp.env.desired_observation_width,),
|
||||
name='observation_stats')
|
||||
self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
|
||||
shape=(),
|
||||
name='reward_stats')
|
||||
self.running_observation_stats = sv.SharedRunningStats(self.tp, replicated_device,
|
||||
shape=(self.tp.env.desired_observation_width,),
|
||||
name='observation_stats')
|
||||
self.running_reward_stats = sv.SharedRunningStats(self.tp, replicated_device,
|
||||
shape=(),
|
||||
name='reward_stats')
|
||||
|
||||
# env is already reset at this point. Otherwise we're getting an error where you cannot
|
||||
# reset an env which is not done
|
||||
@@ -137,13 +132,13 @@ class Agent(object):
|
||||
def log_to_screen(self, phase):
|
||||
# log to screen
|
||||
if self.current_episode >= 0:
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
exploration = self.exploration_policy.get_control_param()
|
||||
else:
|
||||
exploration = self.evaluation_exploration_policy.get_control_param()
|
||||
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
logger.screen.log_dict(
|
||||
collections.OrderedDict([
|
||||
("Worker", self.task_id),
|
||||
("Episode", self.current_episode),
|
||||
("total reward", self.total_reward_in_current_episode),
|
||||
@@ -154,37 +149,37 @@ class Agent(object):
|
||||
prefix=phase
|
||||
)
|
||||
|
||||
def update_log(self, phase=RunPhase.TRAIN):
|
||||
def update_log(self, phase=utils.RunPhase.TRAIN):
|
||||
"""
|
||||
Writes logging messages to screen and updates the log file with all the signal values.
|
||||
:return: None
|
||||
"""
|
||||
# log all the signals to file
|
||||
logger.set_current_time(self.current_episode)
|
||||
logger.create_signal_value('Training Iter', self.training_iteration)
|
||||
logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
|
||||
logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
|
||||
logger.create_signal_value('ER #Episodes', self.memory.length())
|
||||
logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
|
||||
logger.create_signal_value('Total steps', self.total_steps_counter)
|
||||
logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
|
||||
logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
|
||||
if phase == RunPhase.TRAIN else np.nan)
|
||||
logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
|
||||
if phase == RunPhase.TEST else np.nan)
|
||||
logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
logger.update_wall_clock_time(self.current_episode)
|
||||
logger.logger.set_current_time(self.current_episode)
|
||||
logger.logger.create_signal_value('Training Iter', self.training_iteration)
|
||||
logger.logger.create_signal_value('In Heatup', int(phase == utils.RunPhase.HEATUP))
|
||||
logger.logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
|
||||
logger.logger.create_signal_value('ER #Episodes', self.memory.length())
|
||||
logger.logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
|
||||
logger.logger.create_signal_value('Total steps', self.total_steps_counter)
|
||||
logger.logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
|
||||
logger.logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
|
||||
if phase == utils.RunPhase.TRAIN else np.nan)
|
||||
logger.logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
|
||||
if phase == utils.RunPhase.TEST else np.nan)
|
||||
logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
logger.logger.update_wall_clock_time(self.current_episode)
|
||||
|
||||
for signal in self.signals:
|
||||
logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
|
||||
logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
|
||||
logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
|
||||
logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
|
||||
logger.logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
|
||||
logger.logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
|
||||
logger.logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
|
||||
logger.logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
|
||||
|
||||
# dump
|
||||
if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
|
||||
and self.current_episode > 0:
|
||||
logger.dump_output_csv()
|
||||
logger.logger.dump_output_csv()
|
||||
|
||||
def reset_game(self, do_not_reset_env=False):
|
||||
"""
|
||||
@@ -211,7 +206,7 @@ class Agent(object):
|
||||
self.episode_running_info[action] = []
|
||||
plt.clf()
|
||||
|
||||
if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
|
||||
if self.tp.agent.middleware_type == configurations.MiddlewareTypes.LSTM:
|
||||
for network in self.networks:
|
||||
network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
|
||||
network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init
|
||||
@@ -281,9 +276,9 @@ class Agent(object):
|
||||
if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
|
||||
for network in self.networks:
|
||||
network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
|
||||
logger.create_signal_value('Update Target Network', 1)
|
||||
logger.logger.create_signal_value('Update Target Network', 1)
|
||||
else:
|
||||
logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
|
||||
return loss
|
||||
|
||||
@@ -321,7 +316,7 @@ class Agent(object):
|
||||
plt.legend()
|
||||
plt.pause(0.00000001)
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
"""
|
||||
choose an action to act with in the current episode being played. Different behavior might be exhibited when training
|
||||
or testing.
|
||||
@@ -351,15 +346,15 @@ class Agent(object):
|
||||
for input_name in self.tp.agent.input_types.keys():
|
||||
input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0)
|
||||
return input_state
|
||||
|
||||
|
||||
def prepare_initial_state(self):
|
||||
"""
|
||||
Create an initial state when starting a new episode
|
||||
:return: None
|
||||
"""
|
||||
observation = self.preprocess_observation(self.env.state['observation'])
|
||||
self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
|
||||
observation = LazyStack(self.curr_stack, -1)
|
||||
self.curr_stack = collections.deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
|
||||
observation = utils.LazyStack(self.curr_stack, -1)
|
||||
|
||||
self.curr_state = {
|
||||
'observation': observation
|
||||
@@ -369,21 +364,21 @@ class Agent(object):
|
||||
if self.tp.agent.use_accumulated_reward_as_measurement:
|
||||
self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
|
||||
|
||||
def act(self, phase=RunPhase.TRAIN):
|
||||
def act(self, phase=utils.RunPhase.TRAIN):
|
||||
"""
|
||||
Take one step in the environment according to the network prediction and store the transition in memory
|
||||
:param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
|
||||
:return: A boolean value that signals an episode termination
|
||||
"""
|
||||
|
||||
if phase != RunPhase.TEST:
|
||||
if phase != utils.RunPhase.TEST:
|
||||
self.total_steps_counter += 1
|
||||
self.current_episode_steps_counter += 1
|
||||
|
||||
# get new action
|
||||
action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0}
|
||||
|
||||
if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
|
||||
if phase == utils.RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
|
||||
action = self.env.get_random_action()
|
||||
else:
|
||||
action, action_info = self.choose_action(self.curr_state, phase=phase)
|
||||
@@ -402,13 +397,13 @@ class Agent(object):
|
||||
next_state['observation'] = self.preprocess_observation(next_state['observation'])
|
||||
|
||||
# plot action values online
|
||||
if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
|
||||
if self.tp.visualization.plot_action_values_online and phase != utils.RunPhase.HEATUP:
|
||||
self.plot_action_values_online()
|
||||
|
||||
# initialize the next state
|
||||
# TODO: provide option to stack more than just the observation
|
||||
self.curr_stack.append(next_state['observation'])
|
||||
observation = LazyStack(self.curr_stack, -1)
|
||||
observation = utils.LazyStack(self.curr_stack, -1)
|
||||
|
||||
next_state['observation'] = observation
|
||||
if self.tp.agent.use_measurements and 'measurements' in result.keys():
|
||||
@@ -417,14 +412,14 @@ class Agent(object):
|
||||
next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
|
||||
|
||||
# store the transition only if we are training
|
||||
if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
|
||||
transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
|
||||
if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
|
||||
transition = memory.Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
|
||||
for key in action_info.keys():
|
||||
transition.info[key] = action_info[key]
|
||||
if self.tp.agent.add_a_normalized_timestep_to_the_observation:
|
||||
transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
|
||||
self.memory.store(transition)
|
||||
elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
|
||||
elif phase == utils.RunPhase.TEST and self.tp.visualization.dump_gifs:
|
||||
# we store the transitions only for saving gifs
|
||||
self.last_episode_images.append(self.env.get_rendered_image())
|
||||
|
||||
@@ -437,7 +432,7 @@ class Agent(object):
|
||||
self.update_log(phase=phase)
|
||||
self.log_to_screen(phase=phase)
|
||||
|
||||
if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
|
||||
if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
|
||||
self.reset_game()
|
||||
|
||||
self.current_episode += 1
|
||||
@@ -456,8 +451,8 @@ class Agent(object):
|
||||
|
||||
max_reward_achieved = -float('inf')
|
||||
average_evaluation_reward = 0
|
||||
screen.log_title("Running evaluation")
|
||||
self.env.change_phase(RunPhase.TEST)
|
||||
logger.screen.log_title("Running evaluation")
|
||||
self.env.change_phase(utils.RunPhase.TEST)
|
||||
for i in range(num_episodes):
|
||||
# keep the online network in sync with the global network
|
||||
if keep_networks_synced:
|
||||
@@ -466,7 +461,7 @@ class Agent(object):
|
||||
|
||||
episode_ended = False
|
||||
while not episode_ended:
|
||||
episode_ended = self.act(phase=RunPhase.TEST)
|
||||
episode_ended = self.act(phase=utils.RunPhase.TEST)
|
||||
|
||||
if keep_networks_synced \
|
||||
and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps:
|
||||
@@ -477,7 +472,7 @@ class Agent(object):
|
||||
max_reward_achieved = self.total_reward_in_current_episode
|
||||
frame_skipping = int(5/self.tp.env.frame_skip)
|
||||
if self.tp.visualization.dump_gifs:
|
||||
logger.create_gif(self.last_episode_images[::frame_skipping],
|
||||
logger.logger.create_gif(self.last_episode_images[::frame_skipping],
|
||||
name='score-{}'.format(max_reward_achieved), fps=10)
|
||||
|
||||
average_evaluation_reward += self.total_reward_in_current_episode
|
||||
@@ -485,8 +480,8 @@ class Agent(object):
|
||||
|
||||
average_evaluation_reward /= float(num_episodes)
|
||||
|
||||
self.env.change_phase(RunPhase.TRAIN)
|
||||
screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
|
||||
self.env.change_phase(utils.RunPhase.TRAIN)
|
||||
logger.screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
|
||||
|
||||
def post_training_commands(self):
|
||||
pass
|
||||
@@ -505,15 +500,15 @@ class Agent(object):
|
||||
# heatup phase
|
||||
if self.tp.num_heatup_steps != 0:
|
||||
self.in_heatup = True
|
||||
screen.log_title("Starting heatup {}".format(self.task_id))
|
||||
logger.screen.log_title("Starting heatup {}".format(self.task_id))
|
||||
num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
|
||||
for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
|
||||
self.act(phase=RunPhase.HEATUP)
|
||||
self.act(phase=utils.RunPhase.HEATUP)
|
||||
|
||||
# training phase
|
||||
self.in_heatup = False
|
||||
screen.log_title("Starting training {}".format(self.task_id))
|
||||
self.exploration_policy.change_phase(RunPhase.TRAIN)
|
||||
logger.screen.log_title("Starting training {}".format(self.task_id))
|
||||
self.exploration_policy.change_phase(utils.RunPhase.TRAIN)
|
||||
training_start_time = time.time()
|
||||
model_snapshots_periods_passed = -1
|
||||
self.reset_game()
|
||||
@@ -557,7 +552,7 @@ class Agent(object):
|
||||
self.loss.add_sample(loss)
|
||||
self.training_iteration += 1
|
||||
if self.imitation:
|
||||
self.log_to_screen(RunPhase.TRAIN)
|
||||
self.log_to_screen(utils.RunPhase.TRAIN)
|
||||
self.post_training_commands()
|
||||
|
||||
def save_model(self, model_id):
|
||||
|
||||
@@ -13,16 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
|
||||
from agents.imitation_agent import ImitationAgent
|
||||
from agents import imitation_agent
|
||||
|
||||
|
||||
# Behavioral Cloning Agent
|
||||
class BCAgent(ImitationAgent):
|
||||
class BCAgent(imitation_agent.ImitationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
imitation_agent.ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
current_states, _, actions, _, _, _ = self.extract_batch(batch)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,17 +13,18 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
|
||||
from agents import value_optimization_agent as voa
|
||||
import utils
|
||||
|
||||
# Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
|
||||
class BootstrappedDQNAgent(ValueOptimizationAgent):
|
||||
class BootstrappedDQNAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
|
||||
def reset_game(self, do_not_reset_env=False):
|
||||
ValueOptimizationAgent.reset_game(self, do_not_reset_env)
|
||||
voa.ValueOptimizationAgent.reset_game(self, do_not_reset_env)
|
||||
self.exploration_policy.select_head()
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
@@ -51,8 +52,8 @@ class BootstrappedDQNAgent(ValueOptimizationAgent):
|
||||
|
||||
return total_loss
|
||||
|
||||
def act(self, phase=RunPhase.TRAIN):
|
||||
ValueOptimizationAgent.act(self, phase)
|
||||
def act(self, phase=utils.RunPhase.TRAIN):
|
||||
voa.ValueOptimizationAgent.act(self, phase)
|
||||
mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability,
|
||||
self.tp.exploration.architecture_num_q_heads)
|
||||
self.memory.update_last_transition_info({'mask': mask})
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,14 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
from agents import value_optimization_agent as voa
|
||||
|
||||
|
||||
# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
|
||||
class CategoricalDQNAgent(ValueOptimizationAgent):
|
||||
class CategoricalDQNAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
|
||||
|
||||
# prediction's format is (batch,actions,atoms)
|
||||
@@ -57,4 +58,3 @@ class CategoricalDQNAgent(ValueOptimizationAgent):
|
||||
total_loss = result[0]
|
||||
|
||||
return total_loss
|
||||
|
||||
|
||||
@@ -13,27 +13,34 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from agents.actor_critic_agent import *
|
||||
import collections
|
||||
import copy
|
||||
from random import shuffle
|
||||
|
||||
import numpy as np
|
||||
|
||||
from agents import actor_critic_agent as aca
|
||||
from agents import policy_optimization_agent as poa
|
||||
import logger
|
||||
import utils
|
||||
|
||||
|
||||
# Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
|
||||
class ClippedPPOAgent(ActorCriticAgent):
|
||||
class ClippedPPOAgent(aca.ActorCriticAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
|
||||
create_target_network=True)
|
||||
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
|
||||
create_target_network=True)
|
||||
# signals definition
|
||||
self.value_loss = Signal('Value Loss')
|
||||
self.value_loss = utils.Signal('Value Loss')
|
||||
self.signals.append(self.value_loss)
|
||||
self.policy_loss = Signal('Policy Loss')
|
||||
self.policy_loss = utils.Signal('Policy Loss')
|
||||
self.signals.append(self.policy_loss)
|
||||
self.total_kl_divergence_during_training_process = 0.0
|
||||
self.unclipped_grads = Signal('Grads (unclipped)')
|
||||
self.unclipped_grads = utils.Signal('Grads (unclipped)')
|
||||
self.signals.append(self.unclipped_grads)
|
||||
self.value_targets = Signal('Value Targets')
|
||||
self.value_targets = utils.Signal('Value Targets')
|
||||
self.signals.append(self.value_targets)
|
||||
self.kl_divergence = Signal('KL Divergence')
|
||||
self.kl_divergence = utils.Signal('KL Divergence')
|
||||
self.signals.append(self.kl_divergence)
|
||||
|
||||
def fill_advantages(self, batch):
|
||||
@@ -46,9 +53,9 @@ class ClippedPPOAgent(ActorCriticAgent):
|
||||
# calculate advantages
|
||||
advantages = []
|
||||
value_targets = []
|
||||
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
|
||||
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
|
||||
advantages = total_return - current_state_values
|
||||
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
|
||||
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
|
||||
# get bootstraps
|
||||
episode_start_idx = 0
|
||||
advantages = np.array([])
|
||||
@@ -66,7 +73,7 @@ class ClippedPPOAgent(ActorCriticAgent):
|
||||
advantages = np.append(advantages, rollout_advantages)
|
||||
value_targets = np.append(value_targets, gae_based_value_targets)
|
||||
else:
|
||||
screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
|
||||
|
||||
# standardize
|
||||
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
|
||||
@@ -144,8 +151,8 @@ class ClippedPPOAgent(ActorCriticAgent):
|
||||
curr_learning_rate = self.tp.learning_rate
|
||||
|
||||
# log training parameters
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
logger.screen.log_dict(
|
||||
collections.OrderedDict([
|
||||
("Surrogate loss", loss['policy_losses'][0]),
|
||||
("KL divergence", loss['fetch_result'][0]),
|
||||
("Entropy", loss['fetch_result'][1]),
|
||||
@@ -184,13 +191,13 @@ class ClippedPPOAgent(ActorCriticAgent):
|
||||
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
|
||||
return np.append(losses[0], losses[1])
|
||||
|
||||
def choose_action(self, current_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, current_state, phase=utils.RunPhase.TRAIN):
|
||||
if self.env.discrete_controls:
|
||||
# DISCRETE
|
||||
_, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state))
|
||||
action_values = action_values.squeeze()
|
||||
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
else:
|
||||
action = np.argmax(action_values)
|
||||
@@ -201,7 +208,7 @@ class ClippedPPOAgent(ActorCriticAgent):
|
||||
_, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state))
|
||||
action_values_mean = action_values_mean.squeeze()
|
||||
action_values_std = action_values_std.squeeze()
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
|
||||
# if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5:
|
||||
# print action
|
||||
|
||||
@@ -13,28 +13,34 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import copy
|
||||
|
||||
from agents.actor_critic_agent import *
|
||||
from configurations import *
|
||||
import numpy as np
|
||||
|
||||
from agents import actor_critic_agent as aca
|
||||
from agents import agent
|
||||
from architectures import network_wrapper as nw
|
||||
import configurations as conf
|
||||
import utils
|
||||
|
||||
|
||||
# Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
|
||||
class DDPGAgent(ActorCriticAgent):
|
||||
class DDPGAgent(aca.ActorCriticAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
|
||||
create_target_network=True)
|
||||
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
|
||||
create_target_network=True)
|
||||
# define critic network
|
||||
self.critic_network = self.main_network
|
||||
# self.networks.append(self.critic_network)
|
||||
|
||||
# define actor network
|
||||
tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
|
||||
tuning_parameters.agent.output_types = [OutputTypes.Pi]
|
||||
self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
|
||||
self.replicated_device, self.worker_device)
|
||||
tuning_parameters.agent.input_types = {'observation': conf.InputTypes.Observation}
|
||||
tuning_parameters.agent.output_types = [conf.OutputTypes.Pi]
|
||||
self.actor_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
|
||||
self.replicated_device, self.worker_device)
|
||||
self.networks.append(self.actor_network)
|
||||
|
||||
self.q_values = Signal("Q")
|
||||
self.q_values = utils.Signal("Q")
|
||||
self.signals.append(self.q_values)
|
||||
|
||||
self.reset_game(do_not_reset_env=True)
|
||||
@@ -82,14 +88,14 @@ class DDPGAgent(ActorCriticAgent):
|
||||
return total_loss
|
||||
|
||||
def train(self):
|
||||
return Agent.train(self)
|
||||
return agent.Agent.train(self)
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
assert not self.env.discrete_controls, 'DDPG works only for continuous control problems'
|
||||
result = self.actor_network.online_network.predict(self.tf_input_state(curr_state))
|
||||
action_values = result[0].squeeze()
|
||||
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
else:
|
||||
action = action_values
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,14 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
from agents import value_optimization_agent as voa
|
||||
|
||||
|
||||
# Double DQN - https://arxiv.org/abs/1509.06461
|
||||
class DDQNAgent(ValueOptimizationAgent):
|
||||
class DDQNAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
|
||||
|
||||
@@ -13,17 +13,20 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.agent import *
|
||||
from agents import agent
|
||||
from architectures import network_wrapper as nw
|
||||
import utils
|
||||
|
||||
|
||||
# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
|
||||
class DFPAgent(Agent):
|
||||
class DFPAgent(agent.Agent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.current_goal = self.tp.agent.goal_vector
|
||||
self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
|
||||
self.replicated_device, self.worker_device)
|
||||
self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
|
||||
self.replicated_device, self.worker_device)
|
||||
self.networks.append(self.main_network)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
@@ -45,7 +48,7 @@ class DFPAgent(Agent):
|
||||
|
||||
return total_loss
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
# convert to batch so we can run it through the network
|
||||
observation = np.expand_dims(np.array(curr_state['observation']), 0)
|
||||
measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
|
||||
@@ -66,7 +69,7 @@ class DFPAgent(Agent):
|
||||
self.tp.agent.future_measurements_weights)
|
||||
|
||||
# choose action according to the exploration policy and the current phase (evaluating or training the agent)
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
else:
|
||||
action = np.argmax(action_values)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,14 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
from agents import value_optimization_agent as voa
|
||||
|
||||
|
||||
# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
|
||||
class DistributionalDQNAgent(ValueOptimizationAgent):
|
||||
class DistributionalDQNAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
|
||||
|
||||
# prediction's format is (batch,actions,atoms)
|
||||
@@ -57,4 +58,3 @@ class DistributionalDQNAgent(ValueOptimizationAgent):
|
||||
total_loss = result[0]
|
||||
|
||||
return total_loss
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,14 +13,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
from agents import value_optimization_agent as voa
|
||||
|
||||
|
||||
# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
|
||||
class DQNAgent(ValueOptimizationAgent):
|
||||
class DQNAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
|
||||
|
||||
@@ -13,31 +13,37 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import collections
|
||||
import os
|
||||
|
||||
from agents.agent import *
|
||||
import pygame
|
||||
from pandas.io import pickle
|
||||
|
||||
from agents import agent
|
||||
import logger
|
||||
import utils
|
||||
|
||||
|
||||
class HumanAgent(Agent):
|
||||
class HumanAgent(agent.Agent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
|
||||
self.clock = pygame.time.Clock()
|
||||
self.max_fps = int(self.tp.visualization.max_fps_for_human_control)
|
||||
|
||||
screen.log_title("Human Control Mode")
|
||||
utils.screen.log_title("Human Control Mode")
|
||||
available_keys = self.env.get_available_keys()
|
||||
if available_keys:
|
||||
screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
|
||||
screen.log("")
|
||||
utils.screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
|
||||
utils.screen.log("")
|
||||
for action, key in self.env.get_available_keys():
|
||||
screen.log("\t- {}: {}".format(action, key))
|
||||
screen.separator()
|
||||
utils.screen.log("\t- {}: {}".format(action, key))
|
||||
utils.screen.separator()
|
||||
|
||||
def train(self):
|
||||
return 0
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
action = self.env.get_action_from_user()
|
||||
|
||||
# keep constant fps
|
||||
@@ -49,16 +55,16 @@ class HumanAgent(Agent):
|
||||
return action, {"action_value": 0}
|
||||
|
||||
def save_replay_buffer_and_exit(self):
|
||||
replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p')
|
||||
replay_buffer_path = os.path.join(logger.logger.experiments_path, 'replay_buffer.p')
|
||||
self.memory.tp = None
|
||||
to_pickle(self.memory, replay_buffer_path)
|
||||
screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
|
||||
pickle.to_pickle(self.memory, replay_buffer_path)
|
||||
utils.screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
|
||||
exit()
|
||||
|
||||
def log_to_screen(self, phase):
|
||||
# log to screen
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
# log to utils.screen
|
||||
utils.screen.log_dict(
|
||||
collections.OrderedDict([
|
||||
("Episode", self.current_episode),
|
||||
("total reward", self.total_reward_in_current_episode),
|
||||
("steps", self.total_steps_counter)
|
||||
|
||||
@@ -13,23 +13,27 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import collections
|
||||
|
||||
from agents.agent import *
|
||||
from agents import agent
|
||||
from architectures import network_wrapper as nw
|
||||
import utils
|
||||
import logging
|
||||
|
||||
|
||||
# Imitation Agent
|
||||
class ImitationAgent(Agent):
|
||||
class ImitationAgent(agent.Agent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
|
||||
self.replicated_device, self.worker_device)
|
||||
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
|
||||
self.replicated_device, self.worker_device)
|
||||
self.networks.append(self.main_network)
|
||||
self.imitation = True
|
||||
|
||||
def extract_action_values(self, prediction):
|
||||
return prediction.squeeze()
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
# convert to batch so we can run it through the network
|
||||
prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state))
|
||||
|
||||
@@ -49,10 +53,10 @@ class ImitationAgent(Agent):
|
||||
|
||||
def log_to_screen(self, phase):
|
||||
# log to screen
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
# for the training phase - we log during the episode to visualize the progress in training
|
||||
screen.log_dict(
|
||||
OrderedDict([
|
||||
logging.screen.log_dict(
|
||||
collections.OrderedDict([
|
||||
("Worker", self.task_id),
|
||||
("Episode", self.current_episode),
|
||||
("Loss", self.loss.values[-1]),
|
||||
@@ -62,4 +66,4 @@ class ImitationAgent(Agent):
|
||||
)
|
||||
else:
|
||||
# for the evaluation phase - logging as in regular RL
|
||||
Agent.log_to_screen(self, phase)
|
||||
agent.Agent.log_to_screen(self, phase)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
# Copyright (c) 2017 Intel Corporation
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -13,13 +13,14 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import *
|
||||
from agents import value_optimization_agent as voa
|
||||
|
||||
|
||||
class MixedMonteCarloAgent(ValueOptimizationAgent):
|
||||
class MixedMonteCarloAgent(voa.ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
|
||||
@@ -14,22 +14,21 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import numpy as np
|
||||
import scipy.signal
|
||||
|
||||
from agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from agents.policy_optimization_agent import PolicyOptimizationAgent
|
||||
from logger import logger
|
||||
from utils import Signal, last_sample
|
||||
from agents import value_optimization_agent as voa
|
||||
from agents import policy_optimization_agent as poa
|
||||
import logger
|
||||
import utils
|
||||
|
||||
|
||||
# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
|
||||
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
|
||||
class NStepQAgent(voa.ValueOptimizationAgent, poa.PolicyOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
|
||||
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
|
||||
self.last_gradient_update_step_idx = 0
|
||||
self.q_values = Signal('Q Values')
|
||||
self.unclipped_grads = Signal('Grads (unclipped)')
|
||||
self.value_loss = Signal('Value Loss')
|
||||
self.q_values = utils.Signal('Q Values')
|
||||
self.unclipped_grads = utils.Signal('Grads (unclipped)')
|
||||
self.value_loss = utils.Signal('Value Loss')
|
||||
self.signals.append(self.q_values)
|
||||
self.signals.append(self.unclipped_grads)
|
||||
self.signals.append(self.value_loss)
|
||||
@@ -57,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
|
||||
if game_overs[-1]:
|
||||
R = 0
|
||||
else:
|
||||
R = np.max(self.main_network.target_network.predict(last_sample(next_states)))
|
||||
R = np.max(self.main_network.target_network.predict(utils.last_sample(next_states)))
|
||||
|
||||
for i in reversed(range(num_transitions)):
|
||||
R = rewards[i] + self.tp.agent.discount * R
|
||||
@@ -85,4 +84,4 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
|
||||
else:
|
||||
logger.create_signal_value('Update Target Network', 0, overwrite=False)
|
||||
|
||||
return PolicyOptimizationAgent.train(self)
|
||||
return poa.PolicyOptimizationAgent.train(self)
|
||||
|
||||
@@ -13,21 +13,20 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import numpy as np
|
||||
|
||||
from agents.value_optimization_agent import ValueOptimizationAgent
|
||||
from utils import RunPhase, Signal
|
||||
import utils
|
||||
|
||||
|
||||
# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
|
||||
class NAFAgent(ValueOptimizationAgent):
|
||||
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
|
||||
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
|
||||
self.l_values = Signal("L")
|
||||
self.a_values = Signal("Advantage")
|
||||
self.mu_values = Signal("Action")
|
||||
self.v_values = Signal("V")
|
||||
self.l_values = utils.Signal("L")
|
||||
self.a_values = utils.Signal("Advantage")
|
||||
self.mu_values = utils.Signal("Action")
|
||||
self.v_values = utils.Signal("V")
|
||||
self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]
|
||||
|
||||
def learn_from_batch(self, batch):
|
||||
@@ -49,7 +48,7 @@ class NAFAgent(ValueOptimizationAgent):
|
||||
|
||||
return total_loss
|
||||
|
||||
def choose_action(self, curr_state, phase=RunPhase.TRAIN):
|
||||
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
|
||||
assert not self.env.discrete_controls, 'NAF works only for continuous control problems'
|
||||
|
||||
# convert to batch so we can run it through the network
|
||||
@@ -60,7 +59,7 @@ class NAFAgent(ValueOptimizationAgent):
|
||||
outputs=naf_head.mu,
|
||||
squeeze_output=False,
|
||||
)
|
||||
if phase == RunPhase.TRAIN:
|
||||
if phase == utils.RunPhase.TRAIN:
|
||||
action = self.exploration_policy.get_action(action_values)
|
||||
else:
|
||||
action = action_values
|
||||
|
||||
@@ -13,19 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np

from agents.value_optimization_agent import ValueOptimizationAgent
from agents import value_optimization_agent as voa
from logger import screen
from utils import RunPhase
import utils


# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
class NECAgent(ValueOptimizationAgent):
class NECAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
self.current_episode_state_embeddings = []
self.training_started = False

@@ -52,7 +49,7 @@ class NECAgent(ValueOptimizationAgent):

return total_loss

def act(self, phase=RunPhase.TRAIN):
def act(self, phase=utils.RunPhase.TRAIN):
if self.in_heatup:
# get embedding in heatup (otherwise we get it through choose_action)
embedding = self.main_network.online_network.predict(

@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np

from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa


# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
class PALAgent(ValueOptimizationAgent):
class PALAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.alpha = tuning_parameters.agent.pal_alpha
self.persistent = tuning_parameters.agent.persistent_advantage_learning
self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

@@ -13,25 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from agents.policy_optimization_agent import *
import numpy as np
from logger import *
import tensorflow as tf
try:
import matplotlib.pyplot as plt
except:
from logger import failed_imports
failed_imports.append("matplotlib")

from utils import *
from agents import policy_optimization_agent as poa
import logger
import utils


class PolicyGradientsAgent(PolicyOptimizationAgent):
class PolicyGradientsAgent(poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.returns_mean = Signal('Returns Mean')
self.returns_variance = Signal('Returns Variance')
poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.returns_mean = utils.Signal('Returns Mean')
self.returns_variance = utils.Signal('Returns Variance')
self.signals.append(self.returns_mean)
self.signals.append(self.returns_variance)
self.last_gradient_update_step_idx = 0
@@ -41,21 +34,21 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)

for i in reversed(range(len(total_returns))):
if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")

targets = total_returns
if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -69,12 +62,12 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):

return total_loss

def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
if self.env.discrete_controls:
# DISCRETE
action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -84,7 +77,7 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
# CONTINUOUS
result = self.main_network.online_network.predict(self.tf_input_state(curr_state))
action_values = result[0].squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values

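For coach's own packages the diff switches to aliased submodule imports, so base classes are referenced through the alias rather than as bare names. A minimal sketch of that shape (the subclass name is hypothetical and it assumes the coach source tree is on the import path):

from agents import policy_optimization_agent as poa

# Hypothetical subclass, shown only to illustrate referencing the base class through the alias.
class ExampleAgent(poa.PolicyOptimizationAgent):
    pass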
@@ -1,5 +1,5 @@
#
# Copyright (c) 2017 Intel Corporation
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -13,12 +13,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections

from agents.agent import *
from memories.memory import Episode
import numpy as np

from agents import agent
from architectures import network_wrapper as nw
import logger
import utils


class PolicyGradientRescaler(Enum):
class PolicyGradientRescaler(utils.Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
@@ -30,11 +35,11 @@ class PolicyGradientRescaler(Enum):
GAE = 8


class PolicyOptimizationAgent(Agent):
class PolicyOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)

self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler)
@@ -44,7 +49,7 @@ class PolicyOptimizationAgent(Agent):
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
self.entropy = Signal('Entropy')
self.entropy = utils.Signal('Entropy')
self.signals.append(self.entropy)

self.reset_game(do_not_reset_env=True)
@@ -52,8 +57,8 @@ class PolicyOptimizationAgent(Agent):
def log_to_screen(self, phase):
# log to screen
if self.current_episode > 0:
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),

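Names that previously arrived via star imports, such as OrderedDict and screen, are now reached through collections and logger respectively. A minimal sketch of the qualified calls (the dictionary values are placeholders; it assumes coach's logger module is importable):

import collections

import logger

# Placeholder values, shown only to illustrate the qualified access.
episode_stats = collections.OrderedDict([("Episode", 1), ("total reward", 0.0)])
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")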
@@ -13,36 +13,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import collections
import copy

from agents.actor_critic_agent import *
from random import shuffle
import numpy as np

from agents import actor_critic_agent as aca
from agents import policy_optimization_agent as poa
from architectures import network_wrapper as nw
import configurations
import logger
import utils


# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
class PPOAgent(ActorCriticAgent):
class PPOAgent(aca.ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
self.critic_network = self.main_network

# define the policy network
tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
tuning_parameters.agent.output_types = [OutputTypes.PPO]
tuning_parameters.agent.input_types = {'observation': configurations.InputTypes.Observation}
tuning_parameters.agent.output_types = [configurations.OutputTypes.PPO]
tuning_parameters.agent.optimizer_type = 'Adam'
tuning_parameters.agent.l2_regularization = 0
self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.policy_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.networks.append(self.policy_network)

# signals definition
self.value_loss = Signal('Value Loss')
self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.value_loss)
self.policy_loss = Signal('Policy Loss')
self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.policy_loss)
self.kl_divergence = Signal('KL Divergence')
self.kl_divergence = utils.Signal('KL Divergence')
self.signals.append(self.kl_divergence)
self.total_kl_divergence_during_training_process = 0.0
self.unclipped_grads = Signal('Grads (unclipped)')
self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)

self.reset_game(do_not_reset_env=True)
@@ -57,9 +65,9 @@ class PPOAgent(ActorCriticAgent):

# calculate advantages
advantages = []
if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
@@ -76,7 +84,7 @@ class PPOAgent(ActorCriticAgent):
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
screen.warning("WARNING: The requested policy gradient rescaler is not available")
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")

# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
@@ -107,7 +115,7 @@ class PPOAgent(ActorCriticAgent):
for k, v in current_states.items()
}
total_return_batch = total_return[i * batch_size:(i + 1) * batch_size]
old_policy_values = force_list(self.critic_network.target_network.predict(
old_policy_values = utils.force_list(self.critic_network.target_network.predict(
current_states_batch).squeeze())
if self.critic_network.online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
@@ -155,7 +163,7 @@ class PPOAgent(ActorCriticAgent):
actions = np.expand_dims(actions, -1)

# get old policy probabilities and distribution
old_policy = force_list(self.policy_network.target_network.predict(current_states))
old_policy = utils.force_list(self.policy_network.target_network.predict(current_states))

# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.policy_network.online_network.output_heads[0].kl_divergence,
@@ -196,8 +204,8 @@ class PPOAgent(ActorCriticAgent):
curr_learning_rate = self.tp.learning_rate

# log training parameters
screen.log_dict(
OrderedDict([
logger.screen.log_dict(
collections.OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
@@ -215,7 +223,7 @@ class PPOAgent(ActorCriticAgent):
def update_kl_coefficient(self):
# John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
# his implementation for now because we know it works well
screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
logger.screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))

# update kl coefficient
kl_target = self.tp.agent.target_kl_divergence
@@ -236,7 +244,7 @@ class PPOAgent(ActorCriticAgent):
new_kl_coefficient,
self.policy_network.online_network.output_heads[0].kl_coefficient_ph)

screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
logger.screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))

def post_training_commands(self):
if self.tp.agent.use_kl_regularization:
@@ -264,12 +272,12 @@ class PPOAgent(ActorCriticAgent):
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)

def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
if self.env.discrete_controls:
# DISCRETE
action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()

if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -280,7 +288,7 @@ class PPOAgent(ActorCriticAgent):
action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state))
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean

@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import numpy as np

from agents.value_optimization_agent import *
from agents import value_optimization_agent as voa


# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
class QuantileRegressionDQNAgent(ValueOptimizationAgent):
class QuantileRegressionDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms)

# prediction's format is (batch,actions,atoms)

@@ -13,21 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np

from agents.agent import Agent
from architectures.network_wrapper import NetworkWrapper
from utils import RunPhase, Signal
from agents import agent
from architectures import network_wrapper as nw
import utils


class ValueOptimizationAgent(Agent):
class ValueOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True):
Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.q_values = Signal("Q")
self.q_values = utils.Signal("Q")
self.signals.append(self.q_values)

self.reset_game(do_not_reset_env=True)
@@ -47,12 +46,12 @@ class ValueOptimizationAgent(Agent):
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))

def choose_action(self, curr_state, phase=RunPhase.TRAIN):
def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
prediction = self.get_prediction(curr_state)
actions_q_values = self.get_q_values(prediction)

# choose action according to the exploration policy and the current phase (evaluating or training the agent)
if phase == RunPhase.TRAIN:
if phase == utils.RunPhase.TRAIN:
exploration_policy = self.exploration_policy
else:
exploration_policy = self.evaluation_exploration_policy
