Mirror of https://github.com/gryf/coach.git
Cleanup imports.
Until now, most modules imported all of another module's objects (variables, classes, functions, and even that module's own imports) into their namespace, which could lead, and in places did lead, to unintentional use of indirectly imported classes or functions. With this patch, every star import is replaced by an import of the top-level module that provides the desired class or function. In addition, imports are sorted (where possible) as PEP 8 [1] suggests: standard-library imports first, then third-party imports (numpy, tensorflow, etc.), and finally coach modules, with the groups separated by a blank line.

[1] https://www.python.org/dev/peps/pep-0008/#imports
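For illustration, a minimal sketch of the layout this patch standardizes on. It is not part of the commit itself; ExampleAgent is a hypothetical class, random and numpy stand in for whatever a module actually needs from each group, and the snippet assumes the coach package (the agents and utils modules referenced in the diff below) is importable.

import random                                        # standard library group

import numpy as np                                   # third-party group

import utils                                         # coach modules, imported as modules
from agents import value_optimization_agent as voa


class ExampleAgent(voa.ValueOptimizationAgent):
    # The providing module is now visible at every call site, instead of the
    # old "from utils import *" / "from agents.value_optimization_agent import *".
    def act(self, phase=utils.RunPhase.TRAIN):
        return voa.ValueOptimizationAgent.act(self, phase)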

agents/__init__.py
@@ -13,26 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from agents.actor_critic_agent import *
-from agents.agent import *
-from agents.bc_agent import *
-from agents.bootstrapped_dqn_agent import *
-from agents.clipped_ppo_agent import *
-from agents.ddpg_agent import *
-from agents.ddqn_agent import *
-from agents.dfp_agent import *
-from agents.dqn_agent import *
-from agents.categorical_dqn_agent import *
-from agents.human_agent import *
-from agents.imitation_agent import *
-from agents.mmc_agent import *
-from agents.n_step_q_agent import *
-from agents.naf_agent import *
-from agents.nec_agent import *
-from agents.pal_agent import *
-from agents.policy_gradients_agent import *
-from agents.policy_optimization_agent import *
-from agents.ppo_agent import *
-from agents.value_optimization_agent import *
-from agents.qr_dqn_agent import *
+from agents.actor_critic_agent import ActorCriticAgent
+from agents.agent import Agent
+from agents.bc_agent import BCAgent
+from agents.bootstrapped_dqn_agent import BootstrappedDQNAgent
+from agents.categorical_dqn_agent import CategoricalDQNAgent
+from agents.clipped_ppo_agent import ClippedPPOAgent
+from agents.ddpg_agent import DDPGAgent
+from agents.ddqn_agent import DDQNAgent
+from agents.dfp_agent import DFPAgent
+from agents.dqn_agent import DQNAgent
+from agents.human_agent import HumanAgent
+from agents.imitation_agent import ImitationAgent
+from agents.mmc_agent import MixedMonteCarloAgent
+from agents.n_step_q_agent import NStepQAgent
+from agents.naf_agent import NAFAgent
+from agents.nec_agent import NECAgent
+from agents.pal_agent import PALAgent
+from agents.policy_gradients_agent import PolicyGradientsAgent
+from agents.policy_optimization_agent import PolicyOptimizationAgent
+from agents.ppo_agent import PPOAgent
+from agents.qr_dqn_agent import QuantileRegressionDQNAgent
+from agents.value_optimization_agent import ValueOptimizationAgent
+
+__all__ = [ActorCriticAgent,
+           Agent,
+           BCAgent,
+           BootstrappedDQNAgent,
+           CategoricalDQNAgent,
+           ClippedPPOAgent,
+           DDPGAgent,
+           DDQNAgent,
+           DFPAgent,
+           DQNAgent,
+           HumanAgent,
+           ImitationAgent,
+           MixedMonteCarloAgent,
+           NAFAgent,
+           NECAgent,
+           NStepQAgent,
+           PALAgent,
+           PPOAgent,
+           PolicyGradientsAgent,
+           PolicyOptimizationAgent,
+           QuantileRegressionDQNAgent,
+           ValueOptimizationAgent]

agents/actor_critic_agent.py
@@ -13,23 +13,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np
+from scipy import signal
 
-from agents.policy_optimization_agent import *
-from logger import *
-from utils import *
-import scipy.signal
+from agents import policy_optimization_agent as poa
+import utils
+import logger
 
 
 # Actor Critic - https://arxiv.org/abs/1602.01783
-class ActorCriticAgent(PolicyOptimizationAgent):
+class ActorCriticAgent(poa.PolicyOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
-        PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
+        poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
         self.last_gradient_update_step_idx = 0
-        self.action_advantages = Signal('Advantages')
-        self.state_values = Signal('Values')
-        self.unclipped_grads = Signal('Grads (unclipped)')
-        self.value_loss = Signal('Value Loss')
-        self.policy_loss = Signal('Policy Loss')
+        self.action_advantages = utils.Signal('Advantages')
+        self.state_values = utils.Signal('Values')
+        self.unclipped_grads = utils.Signal('Grads (unclipped)')
+        self.value_loss = utils.Signal('Value Loss')
+        self.policy_loss = utils.Signal('Policy Loss')
         self.signals.append(self.action_advantages)
         self.signals.append(self.state_values)
         self.signals.append(self.unclipped_grads)
@@ -38,7 +39,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
 
     # Discounting function used to calculate discounted returns.
     def discount(self, x, gamma):
-        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+        return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
 
     def get_general_advantage_estimation_values(self, rewards, values):
         # values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... t + n)
@@ -72,20 +73,20 @@ class ActorCriticAgent(PolicyOptimizationAgent):
         # estimate the advantage function
         action_advantages = np.zeros((num_transitions, 1))
 
-        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
+        if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
             if game_overs[-1]:
                 R = 0
             else:
-                R = self.main_network.online_network.predict(last_sample(next_states))[0]
+                R = self.main_network.online_network.predict(utils.last_sample(next_states))[0]
 
             for i in reversed(range(num_transitions)):
                 R = rewards[i] + self.tp.agent.discount * R
                 state_value_head_targets[i] = R
                 action_advantages[i] = R - current_state_values[i]
 
-        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
+        elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
             # get bootstraps
-            bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0]
+            bootstrapped_value = self.main_network.online_network.predict(utils.last_sample(next_states))[0]
             values = np.append(current_state_values, bootstrapped_value)
             if game_overs[-1]:
                 values[-1] = 0
@@ -94,7 +95,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values)
             action_advantages = np.vstack(gae_values)
         else:
-            screen.warning("WARNING: The requested policy gradient rescaler is not available")
+            logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")
 
         action_advantages = action_advantages.squeeze(axis=-1)
         if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -113,7 +114,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
 
         return total_loss
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         # TODO: rename curr_state -> state
 
         # convert to batch so we can run it through the network
@@ -126,7 +127,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             # DISCRETE
             state_value, action_probabilities = self.main_network.online_network.predict(curr_state)
             action_probabilities = action_probabilities.squeeze()
-            if phase == RunPhase.TRAIN:
+            if phase == utils.RunPhase.TRAIN:
                 action = self.exploration_policy.get_action(action_probabilities)
             else:
                 action = np.argmax(action_probabilities)
@@ -137,7 +138,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
             state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state)
             action_values_mean = action_values_mean.squeeze()
             action_values_std = action_values_std.squeeze()
-            if phase == RunPhase.TRAIN:
+            if phase == utils.RunPhase.TRAIN:
                 action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
             else:
                 action = action_values_mean

agents/agent.py
@@ -13,32 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import scipy.ndimage
-try:
-    import matplotlib.pyplot as plt
-except:
-    from logger import failed_imports
-    failed_imports.append("matplotlib")
+import collections
 
-import copy
-from renderer import Renderer
-from configurations import Preset
-from collections import deque
-from utils import LazyStack
-from collections import OrderedDict
-from utils import RunPhase, Signal, is_empty, RunningStat
-from architectures import *
-from exploration_policies import *
-from memories import *
-from memories.memory import *
-from logger import logger, screen
 import random
 import time
-import os
-import itertools
-from architectures.tensorflow_components.shared_variables import SharedRunningStats
+import logger
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    logger.failed_imports.append("matplotlib")
 
+import numpy as np
+from pandas.io import pickle
 from six.moves import range
+import scipy
+
+from architectures.tensorflow_components import shared_variables as sv
+import configurations
+import exploration_policies as ep
+import memories
+from memories import memory
+import renderer
+import utils
 
 
 class Agent(object):
@@ -54,7 +50,7 @@ class Agent(object):
         :param thread_id: int
         """
 
-        screen.log_title("Creating agent {}".format(task_id))
+        logger.screen.log_title("Creating agent {}".format(task_id))
         self.task_id = task_id
         self.sess = tuning_parameters.sess
         self.env = tuning_parameters.env_instance = env
@@ -71,21 +67,20 @@ class Agent(object):
 
         # modules
        if tuning_parameters.agent.load_memory_from_file_path:
-            screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
+            logger.screen.log_title("Loading replay buffer from pickle. Pickle path: {}"
                              .format(tuning_parameters.agent.load_memory_from_file_path))
-            self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
+            self.memory = pickle.read_pickle(tuning_parameters.agent.load_memory_from_file_path)
         else:
-            self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
-        # self.architecture = eval(tuning_parameters.architecture)
+            self.memory = eval('memories.' + tuning_parameters.memory + '(tuning_parameters)')
 
         self.has_global = replicated_device is not None
         self.replicated_device = replicated_device
         self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0"
 
-        self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
-        self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy
+        self.exploration_policy = eval('ep.' + tuning_parameters.exploration.policy + '(tuning_parameters)')
+        self.evaluation_exploration_policy = eval('ep.' + tuning_parameters.exploration.evaluation_policy
                                                   + '(tuning_parameters)')
-        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)
+        self.evaluation_exploration_policy.change_phase(utils.RunPhase.TEST)
 
         # initialize all internal variables
         self.tp = tuning_parameters
@@ -100,28 +95,28 @@ class Agent(object):
         self.episode_running_info = {}
         self.last_episode_evaluation_ran = 0
         self.running_observations = []
-        logger.set_current_time(self.current_episode)
+        logger.logger.set_current_time(self.current_episode)
         self.main_network = None
         self.networks = []
         self.last_episode_images = []
-        self.renderer = Renderer()
+        self.renderer = renderer.Renderer()
 
         # signals
         self.signals = []
-        self.loss = Signal('Loss')
+        self.loss = utils.Signal('Loss')
         self.signals.append(self.loss)
-        self.curr_learning_rate = Signal('Learning Rate')
+        self.curr_learning_rate = utils.Signal('Learning Rate')
         self.signals.append(self.curr_learning_rate)
 
         if self.tp.env.normalize_observation and not self.env.is_state_type_image:
             if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
-                self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,))
-                self.running_reward_stats = RunningStat(())
+                self.running_observation_stats = utils.RunningStat((self.tp.env.desired_observation_width,))
+                self.running_reward_stats = utils.RunningStat(())
             else:
-                self.running_observation_stats = SharedRunningStats(self.tp, replicated_device,
+                self.running_observation_stats = sv.SharedRunningStats(self.tp, replicated_device,
                                                                     shape=(self.tp.env.desired_observation_width,),
                                                                     name='observation_stats')
-                self.running_reward_stats = SharedRunningStats(self.tp, replicated_device,
+                self.running_reward_stats = sv.SharedRunningStats(self.tp, replicated_device,
                                                                shape=(),
                                                                name='reward_stats')
 
@@ -137,13 +132,13 @@ class Agent(object):
     def log_to_screen(self, phase):
         # log to screen
         if self.current_episode >= 0:
-            if phase == RunPhase.TRAIN:
+            if phase == utils.RunPhase.TRAIN:
                 exploration = self.exploration_policy.get_control_param()
             else:
                 exploration = self.evaluation_exploration_policy.get_control_param()
 
-            screen.log_dict(
-                OrderedDict([
+            logger.screen.log_dict(
+                collections.OrderedDict([
                     ("Worker", self.task_id),
                     ("Episode", self.current_episode),
                     ("total reward", self.total_reward_in_current_episode),
@@ -154,37 +149,37 @@ class Agent(object):
                 prefix=phase
             )
 
-    def update_log(self, phase=RunPhase.TRAIN):
+    def update_log(self, phase=utils.RunPhase.TRAIN):
         """
         Writes logging messages to screen and updates the log file with all the signal values.
         :return: None
         """
         # log all the signals to file
-        logger.set_current_time(self.current_episode)
-        logger.create_signal_value('Training Iter', self.training_iteration)
-        logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
-        logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
-        logger.create_signal_value('ER #Episodes', self.memory.length())
-        logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
-        logger.create_signal_value('Total steps', self.total_steps_counter)
-        logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
-        logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
-                                   if phase == RunPhase.TRAIN else np.nan)
-        logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
-                                   if phase == RunPhase.TEST else np.nan)
-        logger.create_signal_value('Update Target Network', 0, overwrite=False)
-        logger.update_wall_clock_time(self.current_episode)
+        logger.logger.set_current_time(self.current_episode)
+        logger.logger.create_signal_value('Training Iter', self.training_iteration)
+        logger.logger.create_signal_value('In Heatup', int(phase == utils.RunPhase.HEATUP))
+        logger.logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
+        logger.logger.create_signal_value('ER #Episodes', self.memory.length())
+        logger.logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
+        logger.logger.create_signal_value('Total steps', self.total_steps_counter)
+        logger.logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
+        logger.logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
+                                          if phase == utils.RunPhase.TRAIN else np.nan)
+        logger.logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
+                                          if phase == utils.RunPhase.TEST else np.nan)
+        logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
+        logger.logger.update_wall_clock_time(self.current_episode)
 
         for signal in self.signals:
-            logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
-            logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
-            logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
-            logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
+            logger.logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
+            logger.logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
+            logger.logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
+            logger.logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())
 
         # dump
         if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                 and self.current_episode > 0:
-            logger.dump_output_csv()
+            logger.logger.dump_output_csv()
 
     def reset_game(self, do_not_reset_env=False):
         """
@@ -211,7 +206,7 @@ class Agent(object):
                 self.episode_running_info[action] = []
             plt.clf()
 
-        if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
+        if self.tp.agent.middleware_type == configurations.MiddlewareTypes.LSTM:
             for network in self.networks:
                 network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
                 network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init
@@ -281,9 +276,9 @@ class Agent(object):
         if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
             for network in self.networks:
                 network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
-            logger.create_signal_value('Update Target Network', 1)
+            logger.logger.create_signal_value('Update Target Network', 1)
         else:
-            logger.create_signal_value('Update Target Network', 0, overwrite=False)
+            logger.logger.create_signal_value('Update Target Network', 0, overwrite=False)
 
         return loss
 
@@ -321,7 +316,7 @@ class Agent(object):
         plt.legend()
         plt.pause(0.00000001)
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         """
         choose an action to act with in the current episode being played. Different behavior might be exhibited when training
         or testing.
@@ -358,8 +353,8 @@ class Agent(object):
         :return: None
         """
         observation = self.preprocess_observation(self.env.state['observation'])
-        self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
-        observation = LazyStack(self.curr_stack, -1)
+        self.curr_stack = collections.deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size)
+        observation = utils.LazyStack(self.curr_stack, -1)
 
         self.curr_state = {
             'observation': observation
@@ -369,21 +364,21 @@ class Agent(object):
         if self.tp.agent.use_accumulated_reward_as_measurement:
             self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
 
-    def act(self, phase=RunPhase.TRAIN):
+    def act(self, phase=utils.RunPhase.TRAIN):
         """
         Take one step in the environment according to the network prediction and store the transition in memory
         :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
        :return: A boolean value that signals an episode termination
         """
 
-        if phase != RunPhase.TEST:
+        if phase != utils.RunPhase.TEST:
            self.total_steps_counter += 1
            self.current_episode_steps_counter += 1
 
         # get new action
         action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0}
 
-        if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
+        if phase == utils.RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
            action = self.env.get_random_action()
         else:
            action, action_info = self.choose_action(self.curr_state, phase=phase)
@@ -402,13 +397,13 @@ class Agent(object):
         next_state['observation'] = self.preprocess_observation(next_state['observation'])
 
         # plot action values online
-        if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
+        if self.tp.visualization.plot_action_values_online and phase != utils.RunPhase.HEATUP:
            self.plot_action_values_online()
 
         # initialize the next state
         # TODO: provide option to stack more than just the observation
         self.curr_stack.append(next_state['observation'])
-        observation = LazyStack(self.curr_stack, -1)
+        observation = utils.LazyStack(self.curr_stack, -1)
 
         next_state['observation'] = observation
         if self.tp.agent.use_measurements and 'measurements' in result.keys():
@@ -417,14 +412,14 @@ class Agent(object):
             next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode)
 
         # store the transition only if we are training
-        if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
-            transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
+        if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
+            transition = memory.Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done'])
             for key in action_info.keys():
                 transition.info[key] = action_info[key]
             if self.tp.agent.add_a_normalized_timestep_to_the_observation:
                 transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
             self.memory.store(transition)
-        elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
+        elif phase == utils.RunPhase.TEST and self.tp.visualization.dump_gifs:
             # we store the transitions only for saving gifs
             self.last_episode_images.append(self.env.get_rendered_image())
 
@@ -437,7 +432,7 @@ class Agent(object):
             self.update_log(phase=phase)
             self.log_to_screen(phase=phase)
 
-            if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
+            if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP:
                 self.reset_game()
 
             self.current_episode += 1
@@ -456,8 +451,8 @@ class Agent(object):
 
         max_reward_achieved = -float('inf')
         average_evaluation_reward = 0
-        screen.log_title("Running evaluation")
-        self.env.change_phase(RunPhase.TEST)
+        logger.screen.log_title("Running evaluation")
+        self.env.change_phase(utils.RunPhase.TEST)
         for i in range(num_episodes):
            # keep the online network in sync with the global network
            if keep_networks_synced:
@@ -466,7 +461,7 @@ class Agent(object):
 
            episode_ended = False
            while not episode_ended:
-                episode_ended = self.act(phase=RunPhase.TEST)
+                episode_ended = self.act(phase=utils.RunPhase.TEST)
 
                if keep_networks_synced \
                        and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps:
@@ -477,7 +472,7 @@ class Agent(object):
                max_reward_achieved = self.total_reward_in_current_episode
                frame_skipping = int(5/self.tp.env.frame_skip)
                if self.tp.visualization.dump_gifs:
-                    logger.create_gif(self.last_episode_images[::frame_skipping],
+                    logger.logger.create_gif(self.last_episode_images[::frame_skipping],
                                      name='score-{}'.format(max_reward_achieved), fps=10)
 
            average_evaluation_reward += self.total_reward_in_current_episode
@@ -485,8 +480,8 @@ class Agent(object):
 
         average_evaluation_reward /= float(num_episodes)
 
-        self.env.change_phase(RunPhase.TRAIN)
-        screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
+        self.env.change_phase(utils.RunPhase.TRAIN)
+        logger.screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))
 
     def post_training_commands(self):
         pass
@@ -505,15 +500,15 @@ class Agent(object):
         # heatup phase
         if self.tp.num_heatup_steps != 0:
            self.in_heatup = True
-            screen.log_title("Starting heatup {}".format(self.task_id))
+            logger.screen.log_title("Starting heatup {}".format(self.task_id))
            num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
            for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)):
-                self.act(phase=RunPhase.HEATUP)
+                self.act(phase=utils.RunPhase.HEATUP)
 
         # training phase
         self.in_heatup = False
-        screen.log_title("Starting training {}".format(self.task_id))
-        self.exploration_policy.change_phase(RunPhase.TRAIN)
+        logger.screen.log_title("Starting training {}".format(self.task_id))
+        self.exploration_policy.change_phase(utils.RunPhase.TRAIN)
         training_start_time = time.time()
         model_snapshots_periods_passed = -1
         self.reset_game()
@@ -557,7 +552,7 @@ class Agent(object):
                self.loss.add_sample(loss)
                self.training_iteration += 1
                if self.imitation:
-                    self.log_to_screen(RunPhase.TRAIN)
+                    self.log_to_screen(utils.RunPhase.TRAIN)
            self.post_training_commands()
 
     def save_model(self, model_id):

agents/bc_agent.py
@@ -13,16 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 import numpy as np
 
-from agents.imitation_agent import ImitationAgent
+from agents import imitation_agent
 
 
 # Behavioral Cloning Agent
-class BCAgent(ImitationAgent):
+class BCAgent(imitation_agent.ImitationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        imitation_agent.ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
 
     def learn_from_batch(self, batch):
         current_states, _, actions, _, _, _ = self.extract_batch(batch)

agents/bootstrapped_dqn_agent.py
@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np
 
-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa
+import utils
 
 # Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf
-class BootstrappedDQNAgent(ValueOptimizationAgent):
+class BootstrappedDQNAgent(voa.ValueOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
 
     def reset_game(self, do_not_reset_env=False):
-        ValueOptimizationAgent.reset_game(self, do_not_reset_env)
+        voa.ValueOptimizationAgent.reset_game(self, do_not_reset_env)
         self.exploration_policy.select_head()
 
     def learn_from_batch(self, batch):
@@ -51,8 +52,8 @@ class BootstrappedDQNAgent(ValueOptimizationAgent):
 
         return total_loss
 
-    def act(self, phase=RunPhase.TRAIN):
-        ValueOptimizationAgent.act(self, phase)
+    def act(self, phase=utils.RunPhase.TRAIN):
+        voa.ValueOptimizationAgent.act(self, phase)
         mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability,
                                   self.tp.exploration.architecture_num_q_heads)
         self.memory.update_last_transition_info({'mask': mask})

agents/categorical_dqn_agent.py
@@ -13,14 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np
 
-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa
 
 
 # Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
-class CategoricalDQNAgent(ValueOptimizationAgent):
+class CategoricalDQNAgent(voa.ValueOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
         self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
 
     # prediction's format is (batch,actions,atoms)
@@ -57,4 +58,3 @@ class CategoricalDQNAgent(ValueOptimizationAgent):
         total_loss = result[0]
 
         return total_loss
-

agents/clipped_ppo_agent.py
@@ -13,27 +13,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from agents.actor_critic_agent import *
+import collections
+import copy
 from random import shuffle
 
+import numpy as np
+
+from agents import actor_critic_agent as aca
+from agents import policy_optimization_agent as poa
+import logger
+import utils
 
 
 # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347
-class ClippedPPOAgent(ActorCriticAgent):
+class ClippedPPOAgent(aca.ActorCriticAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
+        aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                       create_target_network=True)
         # signals definition
-        self.value_loss = Signal('Value Loss')
+        self.value_loss = utils.Signal('Value Loss')
         self.signals.append(self.value_loss)
-        self.policy_loss = Signal('Policy Loss')
+        self.policy_loss = utils.Signal('Policy Loss')
         self.signals.append(self.policy_loss)
         self.total_kl_divergence_during_training_process = 0.0
-        self.unclipped_grads = Signal('Grads (unclipped)')
+        self.unclipped_grads = utils.Signal('Grads (unclipped)')
         self.signals.append(self.unclipped_grads)
-        self.value_targets = Signal('Value Targets')
+        self.value_targets = utils.Signal('Value Targets')
         self.signals.append(self.value_targets)
-        self.kl_divergence = Signal('KL Divergence')
+        self.kl_divergence = utils.Signal('KL Divergence')
         self.signals.append(self.kl_divergence)
 
     def fill_advantages(self, batch):
@@ -46,9 +53,9 @@ class ClippedPPOAgent(ActorCriticAgent):
         # calculate advantages
         advantages = []
         value_targets = []
-        if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
+        if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
             advantages = total_return - current_state_values
-        elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
+        elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
             # get bootstraps
             episode_start_idx = 0
             advantages = np.array([])
@@ -66,7 +73,7 @@ class ClippedPPOAgent(ActorCriticAgent):
                 advantages = np.append(advantages, rollout_advantages)
                 value_targets = np.append(value_targets, gae_based_value_targets)
         else:
            screen.warning("WARNING: The requested policy gradient rescaler is not available")
 
         # standardize
         advantages = (advantages - np.mean(advantages)) / np.std(advantages)
@@ -144,8 +151,8 @@ class ClippedPPOAgent(ActorCriticAgent):
            curr_learning_rate = self.tp.learning_rate
 
         # log training parameters
-        screen.log_dict(
-            OrderedDict([
+        logger.screen.log_dict(
+            collections.OrderedDict([
                 ("Surrogate loss", loss['policy_losses'][0]),
                 ("KL divergence", loss['fetch_result'][0]),
                 ("Entropy", loss['fetch_result'][1]),
@@ -184,13 +191,13 @@ class ClippedPPOAgent(ActorCriticAgent):
         self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
         return np.append(losses[0], losses[1])
 
-    def choose_action(self, current_state, phase=RunPhase.TRAIN):
+    def choose_action(self, current_state, phase=utils.RunPhase.TRAIN):
         if self.env.discrete_controls:
            # DISCRETE
            _, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state))
            action_values = action_values.squeeze()
 
-            if phase == RunPhase.TRAIN:
+            if phase == utils.RunPhase.TRAIN:
                action = self.exploration_policy.get_action(action_values)
            else:
                action = np.argmax(action_values)
@@ -201,7 +208,7 @@ class ClippedPPOAgent(ActorCriticAgent):
            _, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state))
            action_values_mean = action_values_mean.squeeze()
            action_values_std = action_values_std.squeeze()
-            if phase == RunPhase.TRAIN:
+            if phase == utils.RunPhase.TRAIN:
                action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
                # if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5:
                # print action

agents/ddpg_agent.py
@@ -13,28 +13,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import copy
 
-from agents.actor_critic_agent import *
-from configurations import *
+import numpy as np
+from agents import actor_critic_agent as aca
+from agents import agent
+from architectures import network_wrapper as nw
+import configurations as conf
+import utils
 
 
 # Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf
-class DDPGAgent(ActorCriticAgent):
+class DDPGAgent(aca.ActorCriticAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
+        aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
                                       create_target_network=True)
         # define critic network
         self.critic_network = self.main_network
         # self.networks.append(self.critic_network)
 
         # define actor network
-        tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
-        tuning_parameters.agent.output_types = [OutputTypes.Pi]
-        self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
+        tuning_parameters.agent.input_types = {'observation': conf.InputTypes.Observation}
+        tuning_parameters.agent.output_types = [conf.OutputTypes.Pi]
+        self.actor_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'actor',
                                                self.replicated_device, self.worker_device)
         self.networks.append(self.actor_network)
 
-        self.q_values = Signal("Q")
+        self.q_values = utils.Signal("Q")
         self.signals.append(self.q_values)
 
         self.reset_game(do_not_reset_env=True)
@@ -82,14 +88,14 @@ class DDPGAgent(ActorCriticAgent):
         return total_loss
 
     def train(self):
-        return Agent.train(self)
+        return agent.Agent.train(self)
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         assert not self.env.discrete_controls, 'DDPG works only for continuous control problems'
         result = self.actor_network.online_network.predict(self.tf_input_state(curr_state))
         action_values = result[0].squeeze()
 
-        if phase == RunPhase.TRAIN:
+        if phase == utils.RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
         else:
            action = action_values

agents/ddqn_agent.py
@@ -13,14 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np
 
-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa
 
 
 # Double DQN - https://arxiv.org/abs/1509.06461
-class DDQNAgent(ValueOptimizationAgent):
+class DDQNAgent(voa.ValueOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
 
     def learn_from_batch(self, batch):
         current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

agents/dfp_agent.py
@@ -13,16 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np
 
-from agents.agent import *
+from agents import agent
+from architectures import network_wrapper as nw
+import utils
 
 
 # Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
-class DFPAgent(Agent):
+class DFPAgent(agent.Agent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
         self.current_goal = self.tp.agent.goal_vector
-        self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
+        self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
                                            self.replicated_device, self.worker_device)
         self.networks.append(self.main_network)
 
@@ -45,7 +48,7 @@ class DFPAgent(Agent):
 
         return total_loss
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         # convert to batch so we can run it through the network
         observation = np.expand_dims(np.array(curr_state['observation']), 0)
         measurements = np.expand_dims(np.array(curr_state['measurements']), 0)
@@ -66,7 +69,7 @@ class DFPAgent(Agent):
                                               self.tp.agent.future_measurements_weights)
 
         # choose action according to the exploration policy and the current phase (evaluating or training the agent)
-        if phase == RunPhase.TRAIN:
+        if phase == utils.RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
         else:
            action = np.argmax(action_values)

@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import numpy as np

-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa


# Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
-class DistributionalDQNAgent(ValueOptimizationAgent):
+class DistributionalDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)

# prediction's format is (batch,actions,atoms)
@@ -57,4 +58,3 @@ class DistributionalDQNAgent(ValueOptimizationAgent):
total_loss = result[0]

return total_loss

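
For orientation, the z_values line above fixes the support of the categorical return distribution. A tiny sketch with assumed numbers for v_min, v_max and atoms (the real values come from the tuning parameters, not from this diff):

import numpy as np

v_min, v_max, atoms = -10.0, 10.0, 51          # assumed example values
z_values = np.linspace(v_min, v_max, atoms)    # fixed support of the return distribution
print(z_values[:3], z_values[-1])              # first atoms -10.0, -9.6, -9.2 ... last atom 10.0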

@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import numpy as np

-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa


# Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
-class DQNAgent(ValueOptimizationAgent):
+class DQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

def learn_from_batch(self, batch):
current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
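
The same aliasing recipe repeats in every value-based agent: import the module once under a short alias and qualify each use. A hedged sketch of a downstream agent written in the new style -- the class body is illustrative and running it requires the Coach sources on sys.path:

from agents import value_optimization_agent as voa
import utils


class MyDQNVariant(voa.ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        # the base class and helpers are reached through their modules,
        # so nothing can shadow them accidentally
        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.q_values = utils.Signal("Q")

    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
        return voa.ValueOptimizationAgent.choose_action(self, curr_state, phase)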

@@ -13,31 +13,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import collections
+import os

-from agents.agent import *
import pygame
+from pandas.io import pickle

+from agents import agent
+import logger
+import utils


-class HumanAgent(Agent):
+class HumanAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)

self.clock = pygame.time.Clock()
self.max_fps = int(self.tp.visualization.max_fps_for_human_control)

-screen.log_title("Human Control Mode")
+utils.screen.log_title("Human Control Mode")
available_keys = self.env.get_available_keys()
if available_keys:
-screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
+utils.screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
-screen.log("")
+utils.screen.log("")
for action, key in self.env.get_available_keys():
-screen.log("\t- {}: {}".format(action, key))
+utils.screen.log("\t- {}: {}".format(action, key))
-screen.separator()
+utils.screen.separator()

def train(self):
return 0

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
action = self.env.get_action_from_user()

# keep constant fps
@@ -49,16 +55,16 @@ class HumanAgent(Agent):
return action, {"action_value": 0}

def save_replay_buffer_and_exit(self):
-replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p')
+replay_buffer_path = os.path.join(logger.logger.experiments_path, 'replay_buffer.p')
self.memory.tp = None
-to_pickle(self.memory, replay_buffer_path)
+pickle.to_pickle(self.memory, replay_buffer_path)
-screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
+utils.screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
exit()

def log_to_screen(self, phase):
-# log to screen
+# log to utils.screen
-screen.log_dict(
+utils.screen.log_dict(
-OrderedDict([
+collections.OrderedDict([
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
("steps", self.total_steps_counter)
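
The "# keep constant fps" branch above relies on pygame's frame limiter. A generic pygame sketch of that mechanism (not the agent's actual control loop):

import pygame

clock = pygame.time.Clock()
max_fps = 10                  # mirrors tp.visualization.max_fps_for_human_control

for _ in range(3):            # stand-in for the real control loop
    # ... read the keyboard, map the key to an action, step the environment ...
    clock.tick(max_fps)       # sleeps just enough to cap the loop at max_fps iterations per second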

@@ -13,15 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import collections

-from agents.agent import *
+from agents import agent
+from architectures import network_wrapper as nw
+import utils
+import logging


# Imitation Agent
-class ImitationAgent(Agent):
+class ImitationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
+self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
self.imitation = True
@@ -29,7 +33,7 @@ class ImitationAgent(Agent):
def extract_action_values(self, prediction):
return prediction.squeeze()

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state))

@@ -49,10 +53,10 @@ class ImitationAgent(Agent):

def log_to_screen(self, phase):
# log to screen
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
# for the training phase - we log during the episode to visualize the progress in training
-screen.log_dict(
+logging.screen.log_dict(
-OrderedDict([
+collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("Loss", self.loss.values[-1]),
@@ -62,4 +66,4 @@ class ImitationAgent(Agent):
)
else:
# for the evaluation phase - logging as in regular RL
-Agent.log_to_screen(self, phase)
+agent.Agent.log_to_screen(self, phase)
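
The logging call above simply hands an ordered field map to a screen logger. A tiny print-based stand-in for that pattern (log_dict below is hypothetical, not Coach's screen object):

import collections


def log_dict(fields, prefix="Training"):
    print(prefix + " - " + ", ".join("{}: {}".format(k, v) for k, v in fields.items()))


log_dict(collections.OrderedDict([
    ("Worker", 0),
    ("Episode", 12),
    ("Loss", 0.042),
]))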

@@ -13,13 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import numpy as np

-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa


-class MixedMonteCarloAgent(ValueOptimizationAgent):
+class MixedMonteCarloAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

def learn_from_batch(self, batch):
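
The mixing_rate read above is the blending weight of Mixed Monte Carlo: the training target mixes the one-step bootstrapped target with the observed Monte Carlo return. The update itself sits outside this hunk, so the following is a generic illustration of that blend under assumed numbers, not the repository's exact code:

mixing_rate = 0.1                 # from tuning_parameters.agent.monte_carlo_mixing_rate (assumed value)
discount = 0.99

reward, next_max_q = 1.0, 4.0     # hypothetical one-step quantities
monte_carlo_return = 6.0          # hypothetical discounted return observed from this state

one_step_target = reward + discount * next_max_q
mixed_target = (1 - mixing_rate) * one_step_target + mixing_rate * monte_carlo_return
print(round(mixed_target, 3))     # ~5.064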

@@ -14,22 +14,21 @@
# limitations under the License.
#
import numpy as np
-import scipy.signal

-from agents.value_optimization_agent import ValueOptimizationAgent
+from agents import value_optimization_agent as voa
-from agents.policy_optimization_agent import PolicyOptimizationAgent
+from agents import policy_optimization_agent as poa
-from logger import logger
+import logger
-from utils import Signal, last_sample
+import utils


# N Step Q Learning Agent - https://arxiv.org/abs/1602.01783
-class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
+class NStepQAgent(voa.ValueOptimizationAgent, poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True)
self.last_gradient_update_step_idx = 0
-self.q_values = Signal('Q Values')
+self.q_values = utils.Signal('Q Values')
-self.unclipped_grads = Signal('Grads (unclipped)')
+self.unclipped_grads = utils.Signal('Grads (unclipped)')
-self.value_loss = Signal('Value Loss')
+self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.q_values)
self.signals.append(self.unclipped_grads)
self.signals.append(self.value_loss)
@@ -57,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
if game_overs[-1]:
R = 0
else:
-R = np.max(self.main_network.target_network.predict(last_sample(next_states)))
+R = np.max(self.main_network.target_network.predict(utils.last_sample(next_states)))

for i in reversed(range(num_transitions)):
R = rewards[i] + self.tp.agent.discount * R
@@ -85,4 +84,4 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
else:
logger.create_signal_value('Update Target Network', 0, overwrite=False)

-return PolicyOptimizationAgent.train(self)
+return poa.PolicyOptimizationAgent.train(self)
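
The bootstrap-and-fold loop visible in the hunk above is the core of the n-step return: take the target network's max-Q on the last state unless the episode ended, then fold rewards in backwards with the discount factor. A self-contained numeric sketch of the same recursion (the rollout values are made up):

import numpy as np

rewards = [1.0, 0.0, 2.0]     # hypothetical rollout rewards
discount = 0.99
game_over = False
bootstrap_q = 5.0             # stand-in for np.max(target_network.predict(last_sample(next_states)))

R = 0.0 if game_over else bootstrap_q
targets = []
for r in reversed(rewards):
    R = r + discount * R      # same recursion as in the hunk above
    targets.append(R)
targets.reverse()             # targets[i] is the n-step return from step i
print(np.round(targets, 3))   # roughly [7.81, 6.88, 6.95]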

@@ -13,21 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np

from agents.value_optimization_agent import ValueOptimizationAgent
-from utils import RunPhase, Signal
+import utils


# Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf
class NAFAgent(ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-self.l_values = Signal("L")
+self.l_values = utils.Signal("L")
-self.a_values = Signal("Advantage")
+self.a_values = utils.Signal("Advantage")
-self.mu_values = Signal("Action")
+self.mu_values = utils.Signal("Action")
-self.v_values = Signal("V")
+self.v_values = utils.Signal("V")
self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]

def learn_from_batch(self, batch):
@@ -49,7 +48,7 @@ class NAFAgent(ValueOptimizationAgent):

return total_loss

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
assert not self.env.discrete_controls, 'NAF works only for continuous control problems'

# convert to batch so we can run it through the network
@@ -60,7 +59,7 @@ class NAFAgent(ValueOptimizationAgent):
outputs=naf_head.mu,
squeeze_output=False,
)
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values

@@ -13,18 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+from agents import value_optimization_agent as voa
-import numpy as np

-from agents.value_optimization_agent import ValueOptimizationAgent
from logger import screen
-from utils import RunPhase
+import utils


# Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf
-class NECAgent(ValueOptimizationAgent):
+class NECAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=False)
self.current_episode_state_embeddings = []
self.training_started = False
@@ -52,7 +49,7 @@ class NECAgent(ValueOptimizationAgent):

return total_loss

-def act(self, phase=RunPhase.TRAIN):
+def act(self, phase=utils.RunPhase.TRAIN):
if self.in_heatup:
# get embedding in heatup (otherwise we get it through choose_action)
embedding = self.main_network.online_network.predict(

@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import numpy as np

-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa


# Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf
-class PALAgent(ValueOptimizationAgent):
+class PALAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.alpha = tuning_parameters.agent.pal_alpha
self.persistent = tuning_parameters.agent.persistent_advantage_learning
self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate

@@ -13,25 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

-from agents.policy_optimization_agent import *
import numpy as np
-from logger import *
-import tensorflow as tf
-try:
-import matplotlib.pyplot as plt
-except:
-from logger import failed_imports
-failed_imports.append("matplotlib")

-from utils import *
+from agents import policy_optimization_agent as poa
+import logger
+import utils


-class PolicyGradientsAgent(PolicyOptimizationAgent):
+class PolicyGradientsAgent(poa.PolicyOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-self.returns_mean = Signal('Returns Mean')
+self.returns_mean = utils.Signal('Returns Mean')
-self.returns_variance = Signal('Returns Variance')
+self.returns_variance = utils.Signal('Returns Variance')
self.signals.append(self.returns_mean)
self.signals.append(self.returns_variance)
self.last_gradient_update_step_idx = 0
@@ -41,21 +34,21 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch)

for i in reversed(range(len(total_returns))):
-if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN:
+if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.TOTAL_RETURN:
total_returns[i] = total_returns[0]
-elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN:
+elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN:
# just take the total return as it is
pass
-elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
+elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE:
# we can get a single transition episode while playing Doom Basic, causing the std to be 0
if self.std_discounted_return != 0:
total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return
else:
total_returns[i] = 0
-elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
+elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP:
total_returns[i] -= self.mean_return_over_multiple_episodes[i]
else:
-screen.warning("WARNING: The requested policy gradient rescaler is not available")
+logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")

targets = total_returns
if not self.env.discrete_controls and len(actions.shape) < 2:
@@ -69,12 +62,12 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):

return total_loss

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
# convert to batch so we can run it through the network
if self.env.discrete_controls:
# DISCRETE
action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -84,7 +77,7 @@ class PolicyGradientsAgent(PolicyOptimizationAgent):
# CONTINUOUS
result = self.main_network.online_network.predict(self.tf_input_state(curr_state))
action_values = result[0].squeeze()
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = action_values
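
The FUTURE_RETURN_NORMALIZED_BY_EPISODE branch above standardizes the episode's returns and guards against a zero standard deviation for single-transition episodes. A compact numeric sketch of that branch:

import numpy as np

total_returns = np.array([3.0, 2.0, 1.0])     # hypothetical discounted returns for one episode
mean, std = total_returns.mean(), total_returns.std()
if std != 0:
    rescaled = (total_returns - mean) / std   # same standardization as above
else:
    rescaled = np.zeros_like(total_returns)   # single-transition episode guard
print(rescaled)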

@@ -13,12 +13,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import collections

-from agents.agent import *
+import numpy as np
-from memories.memory import Episode
+from agents import agent
+from architectures import network_wrapper as nw
+import logger
+import utils


-class PolicyGradientRescaler(Enum):
+class PolicyGradientRescaler(utils.Enum):
TOTAL_RETURN = 0
FUTURE_RETURN = 1
FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
@@ -30,10 +35,10 @@ class PolicyGradientRescaler(Enum):
GAE = 8


-class PolicyOptimizationAgent(Agent):
+class PolicyOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False):
-Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
+self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)

@@ -44,7 +49,7 @@ class PolicyOptimizationAgent(Agent):
self.max_episode_length = 100000
self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length)
self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length)
-self.entropy = Signal('Entropy')
+self.entropy = utils.Signal('Entropy')
self.signals.append(self.entropy)

self.reset_game(do_not_reset_env=True)
@@ -52,8 +57,8 @@ class PolicyOptimizationAgent(Agent):
def log_to_screen(self, phase):
# log to screen
if self.current_episode > 0:
-screen.log_dict(
+logger.screen.log_dict(
-OrderedDict([
+collections.OrderedDict([
("Worker", self.task_id),
("Episode", self.current_episode),
("total reward", self.total_reward_in_current_episode),
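
The rescaler class above is an enum whose members select how returns are rescaled before the policy update. A stand-in sketch using the standard library's enum module (the real class derives from the project's utils.Enum, and only the members visible in this hunk are reproduced; the intermediate values are elided):

import enum


class PolicyGradientRescaler(enum.Enum):
    TOTAL_RETURN = 0
    FUTURE_RETURN = 1
    FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2
    GAE = 8


chosen = PolicyGradientRescaler.GAE
print(chosen.name, chosen.value)   # GAE 8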

@@ -13,36 +13,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import collections
+import copy

-from agents.actor_critic_agent import *
+import numpy as np
-from random import shuffle
+from agents import actor_critic_agent as aca
+from agents import policy_optimization_agent as poa
+from architectures import network_wrapper as nw
+import configurations
+import logger
+import utils


# Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf
-class PPOAgent(ActorCriticAgent):
+class PPOAgent(aca.ActorCriticAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
+aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id,
create_target_network=True)
self.critic_network = self.main_network

# define the policy network
-tuning_parameters.agent.input_types = {'observation': InputTypes.Observation}
+tuning_parameters.agent.input_types = {'observation': configurations.InputTypes.Observation}
-tuning_parameters.agent.output_types = [OutputTypes.PPO]
+tuning_parameters.agent.output_types = [configurations.OutputTypes.PPO]
tuning_parameters.agent.optimizer_type = 'Adam'
tuning_parameters.agent.l2_regularization = 0
-self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
+self.policy_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'policy',
self.replicated_device, self.worker_device)
self.networks.append(self.policy_network)

# signals definition
-self.value_loss = Signal('Value Loss')
+self.value_loss = utils.Signal('Value Loss')
self.signals.append(self.value_loss)
-self.policy_loss = Signal('Policy Loss')
+self.policy_loss = utils.Signal('Policy Loss')
self.signals.append(self.policy_loss)
-self.kl_divergence = Signal('KL Divergence')
+self.kl_divergence = utils.Signal('KL Divergence')
self.signals.append(self.kl_divergence)
self.total_kl_divergence_during_training_process = 0.0
-self.unclipped_grads = Signal('Grads (unclipped)')
+self.unclipped_grads = utils.Signal('Grads (unclipped)')
self.signals.append(self.unclipped_grads)

self.reset_game(do_not_reset_env=True)
@@ -57,9 +65,9 @@ class PPOAgent(ActorCriticAgent):

# calculate advantages
advantages = []
-if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE:
+if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE:
advantages = total_return - current_state_values
-elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE:
+elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE:
# get bootstraps
episode_start_idx = 0
advantages = np.array([])
@@ -76,7 +84,7 @@ class PPOAgent(ActorCriticAgent):
episode_start_idx = idx + 1
advantages = np.append(advantages, rollout_advantages)
else:
-screen.warning("WARNING: The requested policy gradient rescaler is not available")
+logger.screen.warning("WARNING: The requested policy gradient rescaler is not available")

# standardize
advantages = (advantages - np.mean(advantages)) / np.std(advantages)
@@ -107,7 +115,7 @@ class PPOAgent(ActorCriticAgent):
for k, v in current_states.items()
}
total_return_batch = total_return[i * batch_size:(i + 1) * batch_size]
-old_policy_values = force_list(self.critic_network.target_network.predict(
+old_policy_values = utils.force_list(self.critic_network.target_network.predict(
current_states_batch).squeeze())
if self.critic_network.online_network.optimizer_type != 'LBFGS':
targets = total_return_batch
@@ -155,7 +163,7 @@ class PPOAgent(ActorCriticAgent):
actions = np.expand_dims(actions, -1)

# get old policy probabilities and distribution
-old_policy = force_list(self.policy_network.target_network.predict(current_states))
+old_policy = utils.force_list(self.policy_network.target_network.predict(current_states))

# calculate gradients and apply on both the local policy network and on the global policy network
fetches = [self.policy_network.online_network.output_heads[0].kl_divergence,
@@ -196,8 +204,8 @@ class PPOAgent(ActorCriticAgent):
curr_learning_rate = self.tp.learning_rate

# log training parameters
-screen.log_dict(
+logger.screen.log_dict(
-OrderedDict([
+collections.OrderedDict([
("Surrogate loss", loss['policy_losses'][0]),
("KL divergence", loss['fetch_result'][0]),
("Entropy", loss['fetch_result'][1]),
@@ -215,7 +223,7 @@ class PPOAgent(ActorCriticAgent):
def update_kl_coefficient(self):
# John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow
# his implementation for now because we know it works well
-screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))
+logger.screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process))

# update kl coefficient
kl_target = self.tp.agent.target_kl_divergence
@@ -236,7 +244,7 @@ class PPOAgent(ActorCriticAgent):
new_kl_coefficient,
self.policy_network.online_network.output_heads[0].kl_coefficient_ph)

-screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))
+logger.screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient))

def post_training_commands(self):
if self.tp.agent.use_kl_regularization:
@@ -264,12 +272,12 @@ class PPOAgent(ActorCriticAgent):
self.update_log() # should be done in order to update the data that has been accumulated * while not playing *
return np.append(value_loss, policy_loss)

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
if self.env.discrete_controls:
# DISCRETE
action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze()

-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
action = self.exploration_policy.get_action(action_values)
else:
action = np.argmax(action_values)
@@ -280,7 +288,7 @@ class PPOAgent(ActorCriticAgent):
action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state))
action_values_mean = action_values_mean.squeeze()
action_values_std = action_values_std.squeeze()
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean)
else:
action = action_values_mean
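
The new ppo_agent header above is a good example of the ordering described in the commit message: standard library first, third-party packages second, project-local modules last, with a blank line between groups. Condensed below; outside the repository the last group is illustrative only, since it imports Coach's own packages:

# standard library
import collections
import copy

# third-party packages
import numpy as np

# project-local modules
from agents import actor_critic_agent as aca
from architectures import network_wrapper as nw
import utils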

@@ -13,14 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import numpy as np

-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa


# Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf
-class QuantileRegressionDQNAgent(ValueOptimizationAgent):
+class QuantileRegressionDQNAgent(voa.ValueOptimizationAgent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms)

# prediction's format is (batch,actions,atoms)

@@ -13,21 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np

-from agents.agent import Agent
+from agents import agent
-from architectures.network_wrapper import NetworkWrapper
+from architectures import network_wrapper as nw
-from utils import RunPhase, Signal
+import utils


-class ValueOptimizationAgent(Agent):
+class ValueOptimizationAgent(agent.Agent):
def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True):
-Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
+self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main',
self.replicated_device, self.worker_device)
self.networks.append(self.main_network)
-self.q_values = Signal("Q")
+self.q_values = utils.Signal("Q")
self.signals.append(self.q_values)

self.reset_game(do_not_reset_env=True)
@@ -47,12 +46,12 @@ class ValueOptimizationAgent(Agent):
'require exploration policies which return a single action.'
).format(policy.__class__.__name__))

-def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
prediction = self.get_prediction(curr_state)
actions_q_values = self.get_q_values(prediction)

# choose action according to the exploration policy and the current phase (evaluating or training the agent)
-if phase == RunPhase.TRAIN:
+if phase == utils.RunPhase.TRAIN:
exploration_policy = self.exploration_policy
else:
exploration_policy = self.evaluation_exploration_policy
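
choose_action above switches between the training-time exploration policy and the evaluation policy based on the phase. A small self-contained stand-in for that switch (the enum and epsilon-greedy rule below are illustrative, not Coach's classes):

import enum
import random


class RunPhase(enum.Enum):
    TRAIN = 0
    TEST = 1


def choose_action(q_values, phase, epsilon=0.1):
    greedy = max(range(len(q_values)), key=q_values.__getitem__)
    if phase == RunPhase.TRAIN and random.random() < epsilon:
        return random.randrange(len(q_values))   # explore during training
    return greedy                                # greedy choice when evaluating


print(choose_action([0.1, 0.5, 0.2], RunPhase.TEST))   # 1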

@@ -13,19 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import logger
-from architectures.architecture import *
-from logger import failed_imports
-try:
-from architectures.tensorflow_components.general_network import *
-from architectures.tensorflow_components.architecture import *
-except ImportError:
-failed_imports.append("TensorFlow")

try:
-from architectures.neon_components.general_network import *
+from architectures.tensorflow_components import general_network as ts_gn
-from architectures.neon_components.architecture import *
+from architectures.tensorflow_components import architecture as ts_arch
except ImportError:
-failed_imports.append("Neon")
+logger.failed_imports.append("TensorFlow")

-from architectures.network_wrapper import *
+try:
+from architectures.neon_components import general_network as neon_gn
+from architectures.neon_components import architecture as neon_arch
+except ImportError:
+logger.failed_imports.append("Neon")
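
The rewritten architectures package init above keeps each backend optional: try the import and record the backend's name when it is missing. A generic, runnable sketch of that pattern with a local failed_imports list standing in for logger.failed_imports:

failed_imports = []

try:
    import tensorflow  # noqa: F401
except ImportError:
    failed_imports.append("TensorFlow")

try:
    import ngraph  # noqa: F401  (the Neon backend)
except ImportError:
    failed_imports.append("Neon")

print("unavailable backends:", failed_imports)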

@@ -14,8 +14,6 @@
# limitations under the License.
#

-from configurations import Preset


class Architecture(object):
def __init__(self, tuning_parameters, name=""):

@@ -13,19 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

-import sys
-import copy
-from ngraph.frontends.neon import *
import ngraph as ng
-from architectures.architecture import *
import numpy as np
-from utils import *
+from architectures import architecture
+import utils


-class NeonArchitecture(Architecture):
+class NeonArchitecture(architecture.Architecture):
def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True):
-Architecture.__init__(self, tuning_parameters, name)
+architecture.Architecture.__init__(self, tuning_parameters, name)
assert tuning_parameters.agent.neon_support, 'Neon is not supported for this agent'
self.clip_error = tuning_parameters.clip_gradients
self.total_loss = None
@@ -113,8 +110,8 @@ class NeonArchitecture(Architecture):
def accumulate_gradients(self, inputs, targets):
# Neon doesn't currently allow separating the grads calculation and grad apply operations
# so this feature is not currently available. instead we do a full training iteration
-inputs = force_list(inputs)
+inputs = utils.force_list(inputs)
-targets = force_list(targets)
+targets = utils.force_list(targets)

for idx, input in enumerate(inputs):
inputs[idx] = input.swapaxes(0, -1)

@@ -13,10 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

-import ngraph.frontends.neon as neon
import ngraph as ng
-from ngraph.util.names import name_scope
+import ngraph.frontends.neon as neon
+import ngraph.util.names as ngraph_names


class InputEmbedder(object):
@@ -31,7 +30,7 @@ class InputEmbedder(object):
self.output = None

def __call__(self, prev_input_placeholder=None):
-with name_scope(self.get_name()):
+with ngraph_names.name_scope(self.get_name()):
# create the input axes
axes = []
if len(self.input_size) == 2:

@@ -13,15 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
+import ngraph as ng
+from ngraph.frontends import neon
+from ngraph.util import names as ngraph_names

-from architectures.neon_components.embedders import *
+from architectures.neon_components import architecture
-from architectures.neon_components.heads import *
+from architectures.neon_components import embedders
-from architectures.neon_components.middleware import *
+from architectures.neon_components import middleware
-from architectures.neon_components.architecture import *
+from architectures.neon_components import heads
-from configurations import InputTypes, OutputTypes, MiddlewareTypes
+import configurations as conf


-class GeneralNeonNetwork(NeonArchitecture):
+class GeneralNeonNetwork(architecture.NeonArchitecture):
def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True):
self.global_network = global_network
self.network_is_local = network_is_local
@@ -34,7 +37,7 @@ class GeneralNeonNetwork(NeonArchitecture):
self.activation_function = self.get_activation_function(
tuning_parameters.agent.hidden_layers_activation_function)

-NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)
+architecture.NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)

def get_activation_function(self, activation_function_string):
activation_functions = {
@@ -53,36 +56,36 @@ class GeneralNeonNetwork(NeonArchitecture):
# the observation can be either an image or a vector
def get_observation_embedding(with_timestep=False):
if self.input_height > 1:
-return ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size,
+return embedders.ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size,
name="observation")
else:
-return VectorEmbedder((self.input_depth, self.input_width + int(with_timestep)), self.batch_size,
+return embedders.VectorEmbedder((self.input_depth, self.input_width + int(with_timestep)), self.batch_size,
name="observation")

input_mapping = {
-InputTypes.Observation: get_observation_embedding(),
+conf.InputTypes.Observation: get_observation_embedding(),
-InputTypes.Measurements: VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"),
+conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"),
-InputTypes.GoalVector: VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"),
+conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"),
-InputTypes.Action: VectorEmbedder((self.num_actions,), self.batch_size, name="action"),
+conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), self.batch_size, name="action"),
-InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
+conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
}
return input_mapping[embedder_type]

def get_middleware_embedder(self, middleware_type):
-return {MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach
+return {conf.MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach
-MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function)
+conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function)

def get_output_head(self, head_type, head_idx, loss_weight=1.):
output_mapping = {
-OutputTypes.Q: QHead,
+conf.OutputTypes.Q: heads.QHead,
-OutputTypes.DuelingQ: DuelingQHead,
+conf.OutputTypes.DuelingQ: heads.DuelingQHead,
-OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach
+conf.OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach
-OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach
+conf.OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach
-OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach
+conf.OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach
-OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach
+conf.OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach
-OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach
+conf.OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach
-OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach
+conf.OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach
-OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach
+conf.OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach
}
return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local)

@@ -104,7 +107,7 @@ class GeneralNeonNetwork(NeonArchitecture):
done_creating_input_placeholders = False

for network_idx in range(self.num_networks):
-with name_scope('network_{}'.format(network_idx)):
+with ngraph_names.name_scope('network_{}'.format(network_idx)):
####################
# Input Embeddings #
####################
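
get_output_head above dispatches through a dictionary from configuration enum members to head factories, with None marking combinations the Neon backend does not support. A small self-contained sketch of that lookup-table dispatch (the enum and classes are illustrative stand-ins):

import enum


class OutputTypes(enum.Enum):
    Q = 0
    DuelingQ = 1
    PPO = 2


class QHead:
    def __init__(self, head_idx):
        self.head_idx = head_idx


output_mapping = {
    OutputTypes.Q: QHead,
    OutputTypes.DuelingQ: QHead,
    OutputTypes.PPO: None,          # not supported by this backend in the sketch
}

head_cls = output_mapping[OutputTypes.Q]
head = head_cls(head_idx=0) if head_cls is not None else None
print(type(head).__name__)          # QHead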
@@ -13,13 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import ngraph as ng
-from ngraph.util.names import name_scope
+from ngraph.frontends import neon
-import ngraph.frontends.neon as neon
+from ngraph.util import names as ngraph_names
-import numpy as np
-from utils import force_list
+import utils
-from architectures.neon_components.losses import *
+from architectures.neon_components import losses


 class Head(object):
@@ -30,7 +29,7 @@ class Head(object):
 self.loss = []
 self.loss_type = []
 self.regularizations = []
-self.loss_weight = force_list(loss_weight)
+self.loss_weight = utils.force_list(loss_weight)
 self.weights_init = neon.GlorotInit()
 self.biases_init = neon.ConstantInit()
 self.target = []
@@ -44,15 +43,15 @@ class Head(object):
 :param input_layer: the input to the graph
 :return: the output of the last layer and the target placeholder
 """
-with name_scope(self.get_name()):
+with ngraph_names.name_scope(self.get_name()):
 self._build_module(input_layer)

-self.output = force_list(self.output)
+self.output = utils.force_list(self.output)
-self.target = force_list(self.target)
+self.target = utils.force_list(self.target)
-self.input = force_list(self.input)
+self.input = utils.force_list(self.input)
-self.loss_type = force_list(self.loss_type)
+self.loss_type = utils.force_list(self.loss_type)
-self.loss = force_list(self.loss)
+self.loss = utils.force_list(self.loss)
-self.regularizations = force_list(self.regularizations)
+self.regularizations = utils.force_list(self.regularizations)
 if self.is_local:
 self.set_loss()

@@ -106,7 +105,7 @@ class QHead(Head):
 if tuning_parameters.agent.replace_mse_with_huber_loss:
 raise Exception("huber loss is not supported in neon")
 else:
-self.loss_type = mean_squared_error
+self.loss_type = losses.mean_squared_error

 def _build_module(self, input_layer):
 # Standard Q Network
@@ -159,7 +158,7 @@ class MeasurementsPredictionHead(Head):
 if tuning_parameters.agent.replace_mse_with_huber_loss:
 raise Exception("huber loss is not supported in neon")
 else:
-self.loss_type = mean_squared_error
+self.loss_type = losses.mean_squared_error

 def _build_module(self, input_layer):
 # This is almost exactly the same as Dueling Network but we predict the future measurements for each action
@@ -167,7 +166,7 @@ class MeasurementsPredictionHead(Head):
 multistep_measurements_size = self.measurements_size[0] * self.num_predicted_steps_ahead

 # actions expectation tower (expectation stream) - E
-with name_scope("expectation_stream"):
+with ngraph_names.name_scope("expectation_stream"):
 expectation_stream = neon.Sequential([
 neon.Affine(nout=256, activation=neon.Rectlin(),
 weight_init=self.weights_init, bias_init=self.biases_init),
@@ -176,7 +175,7 @@ class MeasurementsPredictionHead(Head):
 ])(input_layer)

 # action fine differences tower (action stream) - A
-with name_scope("action_stream"):
+with ngraph_names.name_scope("action_stream"):
 action_stream_unnormalized = neon.Sequential([
 neon.Affine(nout=256, activation=neon.Rectlin(),
 weight_init=self.weights_init, bias_init=self.biases_init),
@@ -191,4 +190,3 @@ class MeasurementsPredictionHead(Head):

 # merge to future measurements predictions
 self.output = repeated_expectation_stream + action_stream

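Both the Neon and the TensorFlow heads normalize their outputs, targets and losses with utils.force_list before wiring up the loss. The helper itself is not shown in this diff; a minimal stand-in with the assumed behavior (pass lists through, wrap anything else in a single-element list) would be:

    def force_list(values):
        # assumed behavior of Coach's utils.force_list: lists pass through unchanged,
        # any other value is wrapped so callers can always iterate
        if isinstance(values, list):
            return values
        return [values]


    assert force_list(42) == [42]
    assert force_list([1, 2]) == [1, 2]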
@@ -13,15 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import ngraph as ng
-import ngraph.frontends.neon as neon
+from ngraph.util import names as ngraph_names
-from ngraph.util.names import name_scope
-import numpy as np


 def mean_squared_error(targets, outputs, weights=1.0, scope=""):
-with name_scope(scope):
+with ngraph_names.name_scope(scope):
 # TODO: reduce mean over the action axis
 loss = ng.squared_L2(targets - outputs)
 weighted_loss = loss * weights
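For reference, the weighted squared-L2 loss that losses.mean_squared_error builds out of ngraph ops can be sketched with plain NumPy. This is only an analogue of the computation, not the ngraph implementation:

    import numpy as np


    def mean_squared_error(targets, outputs, weights=1.0):
        # squared L2 distance between targets and outputs, optionally weighted
        loss = np.sum((targets - outputs) ** 2)
        return loss * weights


    print(mean_squared_error(np.array([1.0, 2.0]), np.array([1.5, 2.5]), weights=0.5))  # 0.25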
@@ -13,11 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

-import ngraph as ng
 import ngraph.frontends.neon as neon
-from ngraph.util.names import name_scope
+from ngraph.util import names as ngraph_names
-import numpy as np


 class MiddlewareEmbedder(object):
@@ -30,7 +27,7 @@ class MiddlewareEmbedder(object):
 self.activation_function = activation_function

 def __call__(self, input_layer):
-with name_scope(self.get_name()):
+with ngraph_names.name_scope(self.get_name()):
 self.input = input_layer
 self._build_module()

@@ -13,20 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os
+import collections

-from collections import OrderedDict
+import configurations as conf
-from configurations import Preset, Frameworks
+import logger
-from logger import *
 try:
 import tensorflow as tf
-from architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork
+from architectures.tensorflow_components import general_network as tf_net #import GeneralTensorFlowNetwork
 except ImportError:
-failed_imports.append("TensorFlow")
+logger.failed_imports.append("TensorFlow")

 try:
-from architectures.neon_components.general_network import GeneralNeonNetwork
+from architectures.neon_components import general_network as neon_net
 except ImportError:
-failed_imports.append("Neon")
+logger.failed_imports.append("Neon")


 class NetworkWrapper(object):
@@ -50,12 +51,12 @@ class NetworkWrapper(object):
 self.name = name
 self.sess = tuning_parameters.sess

-if self.tp.framework == Frameworks.TensorFlow:
+if self.tp.framework == conf.Frameworks.TensorFlow:
-general_network = GeneralTensorFlowNetwork
+general_network = tf_net.GeneralTensorFlowNetwork
-elif self.tp.framework == Frameworks.Neon:
+elif self.tp.framework == conf.Frameworks.Neon:
-general_network = GeneralNeonNetwork
+general_network = neon_net.GeneralNeonNetwork
 else:
-raise Exception("{} Framework is not supported".format(Frameworks().to_string(self.tp.framework)))
+raise Exception("{} Framework is not supported".format(conf.Frameworks().to_string(self.tp.framework)))

 # Global network - the main network shared between threads
 self.global_network = None
@@ -77,13 +78,13 @@ class NetworkWrapper(object):
 self.target_network = general_network(tuning_parameters, '{}/target'.format(name),
 network_is_local=True)

-if not self.tp.distributed and self.tp.framework == Frameworks.TensorFlow:
+if not self.tp.distributed and self.tp.framework == conf.Frameworks.TensorFlow:
 variables_to_restore = tf.global_variables()
 variables_to_restore = [v for v in variables_to_restore if '/online' in v.name]
 self.model_saver = tf.train.Saver(variables_to_restore)
 if self.tp.sess and self.tp.checkpoint_restore_dir:
 checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir)
-screen.log_title("Loading checkpoint: {}".format(checkpoint))
+logger.screen.log_title("Loading checkpoint: {}".format(checkpoint))
 self.model_saver.restore(self.tp.sess, checkpoint)
 self.update_target_network()

@@ -178,8 +179,8 @@ class NetworkWrapper(object):
 def save_model(self, model_id):
 saved_model_path = self.model_saver.save(self.tp.sess, os.path.join(self.tp.save_model_dir,
 str(model_id) + '.ckpt'))
-screen.log_dict(
+logger.screen.log_dict(
-OrderedDict([
+collections.OrderedDict([
 ("Saving model", saved_model_path),
 ]),
 prefix="Checkpoint"
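The network wrapper keeps both frameworks optional: each backend import is attempted, and a failure is recorded on a shared failed_imports list that coach.py later reports. A condensed, stand-alone sketch of that pattern (here the list is local, while in the patch it lives in the logger module):

    failed_imports = []

    try:
        from architectures.tensorflow_components import general_network as tf_net
    except ImportError:
        failed_imports.append("TensorFlow")

    try:
        from architectures.neon_components import general_network as neon_net
    except ImportError:
        failed_imports.append("Neon")

    if failed_imports:
        print("Warning: failed to import the following packages - {}".format(", ".join(set(failed_imports))))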
@@ -15,12 +15,11 @@
 #
 import time

-import numpy as np
 import tensorflow as tf

-from architectures.architecture import Architecture
+from architectures import architecture
-from utils import force_list, squeeze_list
+import configurations as conf
-from configurations import Preset, MiddlewareTypes
+import utils

 def variable_summaries(var):
 """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
@@ -37,14 +36,14 @@ def variable_summaries(var):
 tf.summary.scalar('min', tf.reduce_min(var))
 tf.summary.histogram('histogram', var)

-class TensorFlowArchitecture(Architecture):
+class TensorFlowArchitecture(architecture.Architecture):
 def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True):
 """
 :param tuning_parameters: The parameters used for running the algorithm
 :type tuning_parameters: Preset
 :param name: The name of the network
 """
-Architecture.__init__(self, tuning_parameters, name)
+architecture.Architecture.__init__(self, tuning_parameters, name)
 self.middleware_embedder = None
 self.network_is_local = network_is_local
 assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent'
@@ -174,7 +173,7 @@ class TensorFlowArchitecture(Architecture):
 feed_dict = self._feed_dict(inputs)

 # feed targets
-targets = force_list(targets)
+targets = utils.force_list(targets)
 for placeholder_idx, target in enumerate(targets):
 feed_dict[self.targets[placeholder_idx]] = target

@@ -186,13 +185,13 @@ class TensorFlowArchitecture(Architecture):
 else:
 fetches.append(self.tensor_gradients)
 fetches += [self.total_loss, self.losses]
-if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
+if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM:
 fetches.append(self.middleware_embedder.state_out)
 additional_fetches_start_idx = len(fetches)
 fetches += additional_fetches

 # feed the lstm state if necessary
-if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
+if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM:
 # we can't always assume that we are starting from scratch here can we?
 feed_dict[self.middleware_embedder.c_in] = self.middleware_embedder.c_init
 feed_dict[self.middleware_embedder.h_in] = self.middleware_embedder.h_init
@@ -206,7 +205,7 @@ class TensorFlowArchitecture(Architecture):

 # extract the fetches
 norm_unclipped_grads, grads, total_loss, losses = result[:4]
-if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
+if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM:
 (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
 fetched_tensors = []
 if len(additional_fetches) > 0:
@@ -308,7 +307,7 @@ class TensorFlowArchitecture(Architecture):
 if outputs is None:
 outputs = self.outputs

-if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
+if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM:
 feed_dict[self.middleware_embedder.c_in] = self.curr_rnn_c_in
 feed_dict[self.middleware_embedder.h_in] = self.curr_rnn_h_in

@@ -317,7 +316,7 @@ class TensorFlowArchitecture(Architecture):
 output = self.tp.sess.run(outputs, feed_dict)

 if squeeze_output:
-output = squeeze_list(output)
+output = utils.squeeze_list(output)

 return output

@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import tensorflow as tf

 from configurations import EmbedderComplexity


@@ -13,15 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import tensorflow as tf

-from architectures.tensorflow_components.embedders import *
+from architectures.tensorflow_components import architecture
-from architectures.tensorflow_components.heads import *
+from architectures.tensorflow_components import embedders
-from architectures.tensorflow_components.middleware import *
+from architectures.tensorflow_components import middleware
-from architectures.tensorflow_components.architecture import *
+from architectures.tensorflow_components import heads
-from configurations import InputTypes, OutputTypes, MiddlewareTypes
+import configurations as conf


-class GeneralTensorFlowNetwork(TensorFlowArchitecture):
+class GeneralTensorFlowNetwork(architecture.TensorFlowArchitecture):
 """
 A generalized version of all possible networks implemented using tensorflow.
 """
@@ -37,7 +38,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
 self.activation_function = self.get_activation_function(
 tuning_parameters.agent.hidden_layers_activation_function)

-TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)
+architecture.TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local)

 def get_activation_function(self, activation_function_string):
 activation_functions = {
@@ -56,37 +57,37 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture):
 # the observation can be either an image or a vector
 def get_observation_embedding(with_timestep=False):
 if self.input_height > 1:
-return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation",
+return embedders.ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation",
 input_rescaler=self.tp.agent.input_rescaler)
 else:
-return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation")
+return embedders.VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation")

 input_mapping = {
-InputTypes.Observation: get_observation_embedding(),
+conf.InputTypes.Observation: get_observation_embedding(),
-InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements"),
+conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, name="measurements"),
-InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector"),
+conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, name="goal_vector"),
-InputTypes.Action: VectorEmbedder((self.num_actions,), name="action"),
+conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), name="action"),
-InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
+conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True),
 }
 return input_mapping[embedder_type]

 def get_middleware_embedder(self, middleware_type):
-return {MiddlewareTypes.LSTM: LSTM_Embedder,
+return {conf.MiddlewareTypes.LSTM: middleware.LSTM_Embedder,
-MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function)
+conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function)

 def get_output_head(self, head_type, head_idx, loss_weight=1.):
 output_mapping = {
-OutputTypes.Q: QHead,
+conf.OutputTypes.Q: heads.QHead,
-OutputTypes.DuelingQ: DuelingQHead,
+conf.OutputTypes.DuelingQ: heads.DuelingQHead,
-OutputTypes.V: VHead,
+conf.OutputTypes.V: heads.VHead,
-OutputTypes.Pi: PolicyHead,
+conf.OutputTypes.Pi: heads.PolicyHead,
-OutputTypes.MeasurementsPrediction: MeasurementsPredictionHead,
+conf.OutputTypes.MeasurementsPrediction: heads.MeasurementsPredictionHead,
-OutputTypes.DNDQ: DNDQHead,
+conf.OutputTypes.DNDQ: heads.DNDQHead,
-OutputTypes.NAF: NAFHead,
+conf.OutputTypes.NAF: heads.NAFHead,
-OutputTypes.PPO: PPOHead,
+conf.OutputTypes.PPO: heads.PPOHead,
-OutputTypes.PPO_V: PPOVHead,
+conf.OutputTypes.PPO_V: heads.PPOVHead,
-OutputTypes.CategoricalQ: CategoricalQHead,
+conf.OutputTypes.CategoricalQ: heads.CategoricalQHead,
-OutputTypes.QuantileRegressionQ: QuantileRegressionQHead
+conf.OutputTypes.QuantileRegressionQ: heads.QuantileRegressionQHead
 }
 return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local)

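The motivation for replacing the star imports above can be demonstrated in isolation: a star import also drags in everything the source module itself imported, so code can silently depend on those indirect names. A small runnable illustration built with throwaway in-memory modules (not Coach code):

    import types

    # fabricate a tiny module that has its own import plus one public helper
    module_a = types.ModuleType("module_a")
    exec("import json\ndef helper():\n    return 'helper'", module_a.__dict__)

    # "from module_a import *" would expose both helper() and json here;
    # importing the module and qualifying references keeps only deliberate names in scope
    assert module_a.helper() == "helper"
    assert hasattr(module_a, "json")  # the indirect import a star import would leak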
@@ -13,10 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import tensorflow as tf
 import numpy as np
-from utils import force_list
+import utils


 # Used to initialize weights for policy and value output layers
@@ -36,7 +36,7 @@ class Head(object):
 self.loss = []
 self.loss_type = []
 self.regularizations = []
-self.loss_weight = force_list(loss_weight)
+self.loss_weight = utils.force_list(loss_weight)
 self.target = []
 self.input = []
 self.is_local = is_local
@@ -50,12 +50,12 @@ class Head(object):
 with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
 self._build_module(input_layer)

-self.output = force_list(self.output)
+self.output = utils.force_list(self.output)
-self.target = force_list(self.target)
+self.target = utils.force_list(self.target)
-self.input = force_list(self.input)
+self.input = utils.force_list(self.input)
-self.loss_type = force_list(self.loss_type)
+self.loss_type = utils.force_list(self.loss_type)
-self.loss = force_list(self.loss)
+self.loss = utils.force_list(self.loss)
-self.regularizations = force_list(self.regularizations)
+self.regularizations = utils.force_list(self.regularizations)
 if self.is_local:
 self.set_loss()
 self._post_build()
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import tensorflow as tf
 import numpy as np

@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import tensorflow as tf
 import numpy as np

95 coach.py
@@ -13,46 +13,42 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

-import sys, inspect, re
-import os
-import json
-import presets
-from presets import *
-from utils import set_gpu, list_all_classes_in_module
-from architectures import *
-from environments import *
-from agents import *
-from utils import *
-from logger import screen, logger
-import argparse
-from subprocess import Popen
-import datetime
-import presets
 import atexit
-import sys
+import json
+import os
+import re
 import subprocess
-from threading import Thread
+import sys
+import time

-if len(set(failed_imports)) > 0:
+import agents
-screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports))))
+import argparse
+import configurations as conf
+import environments
+import logger
+import presets
+import utils


+if len(set(logger.failed_imports)) > 0:
+logger.screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(logger.failed_imports))))


 def set_framework(framework_type):
 # choosing neural network framework
-framework = Frameworks().get(framework_type)
+framework = conf.Frameworks().get(framework_type)
 sess = None
-if framework == Frameworks.TensorFlow:
+if framework == conf.Frameworks.TensorFlow:
 import tensorflow as tf
 config = tf.ConfigProto()
 config.allow_soft_placement = True
 config.gpu_options.allow_growth = True
 config.gpu_options.per_process_gpu_memory_fraction = 0.2
 sess = tf.Session(config=config)
-elif framework == Frameworks.Neon:
+elif framework == conf.Frameworks.Neon:
 import ngraph as ng
 sess = ng.transformers.make_transformer()
-screen.log_title("Using {} framework".format(Frameworks().to_string(framework)))
+logger.screen.log_title("Using {} framework".format(conf.Frameworks().to_string(framework)))
 return sess


@@ -66,8 +62,8 @@ def check_input_and_fill_run_dict(parser):

 # list available presets
 if args.list:
-presets_lists = list_all_classes_in_module(presets)
+presets_lists = utils.list_all_classes_in_module(presets)
-screen.log_title("Available Presets:")
+logger.screen.log_title("Available Presets:")
 for preset in presets_lists:
 print(preset)
 sys.exit(0)
@@ -77,28 +73,28 @@ def check_input_and_fill_run_dict(parser):
 # num_workers = int(args.num_workers)
 num_workers = int(re.match("^\d+$", args.num_workers).group(0))
 except ValueError:
-screen.error("Parameter num_workers should be an integer.")
+logger.screen.error("Parameter num_workers should be an integer.")

-preset_names = list_all_classes_in_module(presets)
+preset_names = utils.list_all_classes_in_module(presets)
 if args.preset is not None and args.preset not in preset_names:
-screen.error("A non-existing preset was selected. ")
+logger.screen.error("A non-existing preset was selected. ")

 if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir):
-screen.error("The requested checkpoint folder to load from does not exist. ")
+logger.screen.error("The requested checkpoint folder to load from does not exist. ")

 if args.save_model_sec is not None:
 try:
 args.save_model_sec = int(args.save_model_sec)
 except ValueError:
-screen.error("Parameter save_model_sec should be an integer.")
+logger.screen.error("Parameter save_model_sec should be an integer.")

 if args.preset is None and (args.agent_type is None or args.environment_type is None
 or args.exploration_policy_type is None) and not args.play:
-screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,'
+logger.screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,'
 ' environment_type and exploration_policy_type to assemble a preset. '
 '\nAt least one of these parameters was not given.')
 elif args.preset is None and args.play and args.environment_type is None:
-screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,'
+logger.screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,'
 ' the user is expected to input the desired environment_type and level.'
 '\nAt least one of these parameters was not given.')
 elif args.preset is None and args.play and args.environment_type:
@@ -106,11 +102,11 @@ def check_input_and_fill_run_dict(parser):
 args.exploration_policy_type = 'ExplorationParameters'

 # get experiment name and path
-experiment_name = logger.get_experiment_name(args.experiment_name)
+experiment_name = logger.logger.get_experiment_name(args.experiment_name)
-experiment_path = logger.get_experiment_path(experiment_name)
+experiment_path = logger.logger.get_experiment_path(experiment_name)

 if args.play and num_workers > 1:
-screen.warning("Playing the game as a human is only available with a single worker. "
+logger.screen.warning("Playing the game as a human is only available with a single worker. "
 "The number of workers will be reduced to 1")
 num_workers = 1

@@ -123,7 +119,7 @@ def check_input_and_fill_run_dict(parser):
 run_dict['preset'] = args.preset
 run_dict['custom_parameter'] = args.custom_parameter
 run_dict['experiment_path'] = experiment_path
-run_dict['framework'] = Frameworks().get(args.framework)
+run_dict['framework'] = conf.Frameworks().get(args.framework)
 run_dict['play'] = args.play
 run_dict['evaluate'] = args.evaluate# or args.play

@@ -251,16 +247,16 @@ if __name__ == "__main__":
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

 # dump documentation
-logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True)
+logger.logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True)
 if not args.no_summary:
-atexit.register(logger.summarize_experiment)
+atexit.register(logger.logger.summarize_experiment)
-screen.change_terminal_title(logger.experiment_name)
+logger.screen.change_terminal_title(logger.logger.experiment_name)

 # Single-threaded runs
 if run_dict['num_threads'] == 1:
 # set tuning parameters
 json_run_dict_path = run_dict_to_json(run_dict)
-tuning_parameters = json_to_preset(json_run_dict_path)
+tuning_parameters = presets.json_to_preset(json_run_dict_path)
 tuning_parameters.sess = set_framework(args.framework)

 if args.print_parameters:
@@ -268,8 +264,9 @@ if __name__ == "__main__":

 # Single-thread runs
 tuning_parameters.task_index = 0
-env_instance = create_environment(tuning_parameters)
+env_instance = environments.create_environment(tuning_parameters)
-agent = eval(tuning_parameters.agent.type + '(env_instance, tuning_parameters)')
+agent = eval('agents.' + tuning_parameters.agent.type +
+'(env_instance, tuning_parameters)')

 # Start the training or evaluation
 if tuning_parameters.evaluate:
@@ -282,11 +279,11 @@ if __name__ == "__main__":
 assert args.framework.lower() == 'tensorflow', "Distributed training works only with TensorFlow"
 os.environ["OMP_NUM_THREADS"]="1"
 # set parameter server and workers addresses
-ps_hosts = "localhost:{}".format(get_open_port())
+ps_hosts = "localhost:{}".format(utils.get_open_port())
-worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(run_dict['num_threads'] + 1)])
+worker_hosts = ",".join(["localhost:{}".format(utils.get_open_port()) for i in range(run_dict['num_threads'] + 1)])

 # Make sure to disable GPU so that all the workers will use the CPU
-set_cpu()
+utils.set_cpu()

 # create a parameter server
 cmd = [
@@ -296,9 +293,9 @@ if __name__ == "__main__":
 "--worker_hosts={}".format(worker_hosts),
 "--job_name=ps",
 ]
-parameter_server = Popen(cmd)
+parameter_server = subprocess.Popen(cmd)

-screen.log_title("*** Distributed Training ***")
+logger.screen.log_title("*** Distributed Training ***")
 time.sleep(1)

 # create N training workers and 1 evaluating worker
@@ -321,7 +318,7 @@ if __name__ == "__main__":
 "--job_name=worker",
 "--load_json={}".format(json_run_dict_path)]

-p = Popen(workers_args)
+p = subprocess.Popen(workers_args)

 if i != run_dict['num_threads']:
 workers.append(p)
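After the rewrite, the coach.py import block follows the PEP 8 grouping referenced in the commit message: standard library first, then third-party packages, then Coach's own modules, each group separated by a blank line. Reduced to a few representative names from the patch (and assuming the packages are installed), the convention looks like:

    # standard library
    import json
    import os
    import subprocess
    import sys

    # third-party packages
    import numpy as np
    import tensorflow as tf

    # coach modules
    import configurations as conf
    import logger
    import presets
    import utils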
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

-from utils import Enum
 import json

 import types
+import utils


-class Frameworks(Enum):
+class Frameworks(utils.Enum):
 TensorFlow = 1
 Neon = 2

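configurations.Frameworks stays a Coach-style enum (now derived from utils.Enum), and the call sites in coach.py and the network wrapper resolve it from a string and print it back. A usage sketch based only on those call sites, assuming 'tensorflow' is the string the CLI passes in and that get/to_string behave as those calls suggest:

    import configurations as conf

    framework = conf.Frameworks().get('tensorflow')    # CLI string -> enum value, as in check_input_and_fill_run_dict
    name = conf.Frameworks().to_string(framework)      # enum value -> printable name, as in set_framework
    print("Using {} framework".format(name))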
193 dashboard.py
@@ -19,29 +19,24 @@ To run Coach Dashboard, run the following command:
 python3 dashboard.py
 """

-from utils import *
-import os
-import datetime

-import sys
-import wx
-import random
-import pandas as pd
-from pandas.io.common import EmptyDataError
-import numpy as np
 import colorsys
-from bokeh.palettes import Dark2
+import datetime
-from bokeh.layouts import row, column, widgetbox, Spacer
+import enum
-from bokeh.models import ColumnDataSource, Range1d, LinearAxis, HoverTool, WheelZoomTool, PanTool, Legend
+import itertools
-from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup
+import os
-from bokeh.models.glyphs import Patch
+import random
-from bokeh.plotting import figure, show, curdoc
-from utils import force_list
+from bokeh import palettes
-from utils import squeeze_list
+from bokeh import layouts as bl
-from itertools import cycle
+from bokeh import models as bm
-from os import listdir
+from bokeh.models import widgets as bw
-from os.path import isfile, join, isdir, basename
+from bokeh import plotting as bp
-from enum import Enum
+import numpy as np
+import pandas as pd
+from pandas.io import pandas_common
+import wx

+import utils


 class DialogApp(wx.App):
@@ -67,7 +62,7 @@ class Signal:
 self.name = name
 self.full_name = "{}/{}".format(parent.filename, self.name)
 self.selected = False
-self.color = random.choice(Dark2[8])
+self.color = random.choice(palettes.Dark2[8])
 self.line = None
 self.bands = None
 self.bokeh_source = parent.bokeh_source
@@ -79,12 +74,12 @@ class Signal:
 if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name:
 self.sub_signals.append(name)
 if len(self.sub_signals) > 1:
-self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]])
+self.mean_signal = utils.squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]])
-self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]])
+self.stdev_signal = utils.squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]])
-self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]])
+self.min_signal = utils.squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]])
-self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]])
+self.max_signal = utils.squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]])
 else:
-self.mean_signal = squeeze_list(self.name)
+self.mean_signal = utils.squeeze_list(self.name)
 self.stdev_signal = None
 self.min_signal = None
 self.max_signal = None
@@ -107,16 +102,16 @@ class Signal:
 if self.selected != val:
 self.selected = val
 if self.line:
-# self.set_color(Dark2[8][current_color])
+# self.set_color(palettes.Dark2[8][current_color])
-# current_color = (current_color + 1) % len(Dark2[8])
+# current_color = (current_color + 1) % len(palettes.Dark2[8])
 self.line.visible = self.selected
 if self.bands:
 self.bands.visible = self.selected and self.show_bollinger_bands
 elif self.selected:
 # lazy plotting - plot only when selected for the first time
 show_spinner()
-self.set_color(Dark2[8][current_color])
+self.set_color(palettes.Dark2[8][current_color])
-current_color = (current_color + 1) % len(Dark2[8])
+current_color = (current_color + 1) % len(palettes.Dark2[8])
 if self.has_bollinger_bands:
 self.set_bands_source()
 self.create_bands()
@@ -149,7 +144,7 @@ class Signal:
 if self.bollinger_bands_source:
 self.bollinger_bands_source.data = source_data
 else:
-self.bollinger_bands_source = ColumnDataSource(source_data)
+self.bollinger_bands_source = bm.ColumnDataSource(source_data)

 def change_bollinger_bands_state(self, new_state):
 self.show_bollinger_bands = new_state
@@ -192,11 +187,11 @@ class SignalsFileBase:

 def update_source_and_signals(self):
 # create bokeh data sources
-self.bokeh_source_orig = ColumnDataSource(self.csv)
+self.bokeh_source_orig = bm.ColumnDataSource(self.csv)
 self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis]

 if self.bokeh_source is None:
-self.bokeh_source = ColumnDataSource(self.csv)
+self.bokeh_source = bm.ColumnDataSource(self.csv)
 else:
 # self.bokeh_source.data = self.bokeh_source_orig.data
 # smooth the data if necessary
@@ -282,7 +277,7 @@ class SignalsFile(SignalsFileBase):
 def __init__(self, csv_path, load=True):
 SignalsFileBase.__init__(self)
 self.full_csv_path = csv_path
-self.dir, self.filename, _ = break_file_path(csv_path)
+self.dir, self.filename, _ = utils.break_file_path(csv_path)
 if load:
 self.load()
 # this helps set the correct x axis
@@ -296,7 +291,7 @@ class SignalsFile(SignalsFileBase):
 try:
 self.csv = pd.read_csv(self.full_csv_path)
 break
-except EmptyDataError:
+except pandas_common.EmptyDataError:
 self.csv = None
 continue
 self.csv = self.csv.interpolate()
@@ -327,7 +322,7 @@ class SignalsFilesGroup(SignalsFileBase):
 else:
 # get the common directory for all the experiments
 self.dir = os.path.dirname(os.path.commonprefix(csv_paths))
-self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files))
+self.filename = '{} - Group({})'.format(os.path.basename(self.dir), len(self.signals_files))
 self.load()

 # this helps set the correct x axis
@@ -425,7 +420,7 @@ class SignalsFilesGroup(SignalsFileBase):
 pass


-class RunType(Enum):
+class RunType(enum.Enum):
 SINGLE_FOLDER_SINGLE_FILE = 1
 SINGLE_FOLDER_MULTIPLE_FILES = 2
 MULTIPLE_FOLDERS_SINGLE_FILES = 3
@@ -433,7 +428,7 @@ class RunType(Enum):
 UNKNOWN = 0


-class FolderType(Enum):
+class FolderType(enum.Enum):
 SINGLE_FILE = 1
 MULTIPLE_FILES = 2
 MULTIPLE_FOLDERS = 3
@@ -454,24 +449,24 @@ root_dir = os.path.dirname(os.path.abspath(__file__))
 with open(os.path.join(root_dir, 'spinner.css'), 'r') as f:
 spinner_style = """<style>{}</style>""".format(f.read())
 spinner_html = """<ul class="spinner"><li></li><li></li><li></li><li></li></ul>"""
-spinner = Div(text="""""")
+spinner = bw.Div(text="""""")

 # file refresh time placeholder
-refresh_info = Div(text="""""", width=210)
+refresh_info = bw.Div(text="""""", width=210)

 # create figures
-plot = figure(plot_width=1200, plot_height=800,
+plot = bp.figure(plot_width=1200, plot_height=800,
 tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save',
 toolbar_location='above', x_axis_label='Episodes',
-x_range=Range1d(0, 10000), y_range=Range1d(0, 100000))
+x_range=bm.Range1d(0, 10000), y_range=bm.Range1d(0, 100000))
-plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)}
+plot.extra_y_ranges = {"secondary": bm.Range1d(start=-100, end=200)}
-plot.add_layout(LinearAxis(y_range_name="secondary"), 'right')
+plot.add_layout(bm.LinearAxis(y_range_name="secondary"), 'right')

 # legend
-div = Div(text="""""")
+div = bw.Div(text="""""")
-legend = widgetbox([div])
+legend = bl.widgetbox([div])

-bokeh_legend = Legend(
+bokeh_legend = bm.Legend(
 items=[("12345678901234567890123456789012345678901234567890", [])], # 50 letters
 # items=[(" ", [])], # 50 letters
 location=(-20, 0), orientation="vertical",
@@ -605,8 +600,8 @@ def load_files_group():

 # classify the folder as containing a single file, multiple files or only folders
 def classify_folder(dir_path):
-files = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')]
+files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.csv')]
-folders = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
+folders = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))]
 if len(files) == 1:
 return FolderType.SINGLE_FILE
 elif len(files) > 1:
@@ -628,7 +623,7 @@ def get_run_type(dir_path):

 elif folder_type == FolderType.MULTIPLE_FOLDERS:
 # folder contains sub dirs -> we assume we can classify the folder using only the first sub dir
-sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
+sub_dirs = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))]

 # checking only the first folder in the root dir for its type, since we assume that all sub dirs will share the
 # same structure (i.e. if one is a result of multi-threaded run, so will all the other).
@@ -645,12 +640,12 @@ def add_directory_csv_files(dir_path, paths=None):
 if not paths:
 paths = []

-for p in listdir(dir_path):
+for p in os.listdir(dir_path):
-path = join(dir_path, p)
+path = os.path.join(dir_path, p)
-if isdir(path):
+if os.path.isdir(path):
 # call recursively for each dir
 paths = add_directory_csv_files(path, paths)
-elif isfile(path) and path.endswith('.csv'):
+elif os.path.isfile(path) and path.endswith('.csv'):
 # add every file to the list
 paths.append(path)

@@ -667,7 +662,7 @@ def handle_dir(dir_path, run_type):
 elif run_type == RunType.MULTIPLE_FOLDERS_SINGLE_FILES:
 create_files_group_signal(paths)
 elif run_type == RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES:
-sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))]
+sub_dirs = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))]
 # for d in sub_dirs:
 # paths = add_directory_csv_files(os.path.join(dir_path, d))
 # create_files_group_signal(paths)
@@ -731,7 +726,7 @@ def unload_file():
 selected_file.hide_all_signals()
 del signals_files[selected_file.filename]
 data_selector.options = [""]
-filenames = cycle(files_selector.options)
+filenames = itertools.cycle(files_selector.options)
 files_selector.options.remove(selected_file.filename)
 if len(files_selector.options) > 0:
 files_selector.value = next(filenames)
@@ -869,46 +864,46 @@ crcolor, crRGBs = generate_color_range(color_resolution, brightness) # produce
 # ---------------- Build Website Layout -------------------

 # select file
-file_selection_button = Button(label="Select Files", button_type="success", width=120)
+file_selection_button = bw.Button(label="Select Files", button_type="success", width=120)
 file_selection_button.on_click(load_files_group)

-files_selector_spacer = Spacer(width=10)
+files_selector_spacer = bl.Spacer(width=10)

-group_selection_button = Button(label="Select Directory", button_type="primary", width=140)
+group_selection_button = bw.Button(label="Select Directory", button_type="primary", width=140)
 group_selection_button.on_click(load_directory_group)

-unload_file_button = Button(label="Unload", button_type="danger", width=50)
+unload_file_button = bw.Button(label="Unload", button_type="danger", width=50)
 unload_file_button.on_click(unload_file)

 # files selection box
-files_selector = Select(title="Files:", options=[], width=200)
+files_selector = bw.Select(title="Files:", options=[], width=200)
 files_selector.on_change('value', change_data_selector)

 # data selection box
-data_selector = MultiSelect(title="Data:", options=[], size=12)
+data_selector = bw.MultiSelect(title="Data:", options=[], size=12)
 data_selector.on_change('value', select_data)

 # x axis selection box
-x_axis_selector_title = Div(text="""X Axis:""")
+x_axis_selector_title = bw.Div(text="""X Axis:""")
-x_axis_selector = RadioButtonGroup(labels=x_axis_options, active=0)
+x_axis_selector = bw.RadioButtonGroup(labels=x_axis_options, active=0)
 x_axis_selector.on_click(change_x_axis)

-# toggle second axis button
+# toggle second axis bw.button
-toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success")
+toggle_second_axis_button = bw.Button(label="Toggle Second Axis", button_type="success")
 toggle_second_axis_button.on_click(toggle_second_axis)

 # averaging slider
-averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10)
+averaging_slider = bw.Slider(title="Averaging window", start=1, end=101, step=10)
 averaging_slider.on_change('value', update_averaging)

 # group properties checkbox
-group_cb = CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[])
+group_cb = bw.CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[])
 group_cb.on_click(toggle_group_property)

 # color selector
-color_selector_title = Div(text="""Select Color:""")
+color_selector_title = bw.Div(text="""Select Color:""")
-crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
+crsource = bm.ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs))
-color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10),
+color_selector = bp.figure(x_range=(0, color_resolution), y_range=(0, 10),
 plot_width=300, plot_height=40,
 tools='tap')
 color_selector.axis.visible = False
@@ -920,43 +915,43 @@ color_selector.toolbar.logo = None
 color_selector.toolbar_location = None

 # title
-title = Div(text="""<h1>Coach Dashboard</h1>""")
+title = bw.Div(text="""<h1>Coach Dashboard</h1>""")

 # landing page
-landing_page_description = Div(text="""<h3>Start by selecting an experiment file or directory to open:</h3>""")
+landing_page_description = bw.Div(text="""<h3>Start by selecting an experiment file or directory to open:</h3>""")
|
||||||
center = Div(text="""<style>html { text-align: center; } </style>""")
|
center = bw.Div(text="""<style>html { text-align: center; } </style>""")
|
||||||
center_buttons = Div(text="""<style>.bk-grid-row .bk-layout-fixed { margin: 0 auto; }</style>""", width=0)
|
center_buttons = bw.Div(text="""<style>.bk-grid-row .bk-layout-fixed { margin: 0 auto; }</style>""", width=0)
|
||||||
landing_page = column(center,
|
landing_page = bl.column(center,
|
||||||
title,
|
title,
|
||||||
landing_page_description,
|
landing_page_description,
|
||||||
row(center_buttons),
|
bl.row(center_buttons),
|
||||||
row(file_selection_button, sizing_mode='scale_width'),
|
bl.row(file_selection_button, sizing_mode='scale_width'),
|
||||||
row(group_selection_button, sizing_mode='scale_width'),
|
bl.row(group_selection_button, sizing_mode='scale_width'),
|
||||||
sizing_mode='scale_width')
|
sizing_mode='scale_width')
|
||||||
|
|
||||||
# main layout of the document
|
# main layout of the document
|
||||||
layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
|
layout = bl.row(file_selection_button, files_selector_spacer, group_selection_button, width=300)
|
||||||
layout = column(layout, files_selector)
|
layout = bl.column(layout, files_selector)
|
||||||
layout = column(layout, row(refresh_info, unload_file_button))
|
layout = bl.column(layout, bl.row(refresh_info, unload_file_button))
|
||||||
layout = column(layout, data_selector)
|
layout = bl.column(layout, data_selector)
|
||||||
layout = column(layout, color_selector_title)
|
layout = bl.column(layout, color_selector_title)
|
||||||
layout = column(layout, color_selector)
|
layout = bl.column(layout, color_selector)
|
||||||
layout = column(layout, x_axis_selector_title)
|
layout = bl.column(layout, x_axis_selector_title)
|
||||||
layout = column(layout, x_axis_selector)
|
layout = bl.column(layout, x_axis_selector)
|
||||||
layout = column(layout, group_cb)
|
layout = bl.column(layout, group_cb)
|
||||||
layout = column(layout, toggle_second_axis_button)
|
layout = bl.column(layout, toggle_second_axis_button)
|
||||||
layout = column(layout, averaging_slider)
|
layout = bl.column(layout, averaging_slider)
|
||||||
# layout = column(layout, legend)
|
# layout = bl.column(layout, legend)
|
||||||
layout = row(layout, plot)
|
layout = bl.row(layout, plot)
|
||||||
layout = column(title, layout)
|
layout = bl.column(title, layout)
|
||||||
layout = column(layout, spinner)
|
layout = bl.column(layout, spinner)
|
||||||
|
|
||||||
doc = curdoc()
|
doc = bp.curdoc()
|
||||||
doc.add_root(landing_page)
|
doc.add_root(landing_page)
|
||||||
|
|
||||||
doc.add_periodic_callback(reload_all_files, 20000)
|
doc.add_periodic_callback(reload_all_files, 20000)
|
||||||
plot.y_range = Range1d(0, 100)
|
plot.y_range = bm.Range1d(0, 100)
|
||||||
plot.extra_y_ranges['secondary'] = Range1d(0, 100)
|
plot.extra_y_ranges['secondary'] = bm.Range1d(0, 100)
|
||||||
|
|
||||||
# show load file dialog immediately on start
|
# show load file dialog immediately on start
|
||||||
#doc.add_timeout_callback(load_files, 1000)
|
#doc.add_timeout_callback(load_files, 1000)
|
||||||
|
|||||||
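Note: the bw., bl., bm. and bp. prefixes used throughout the dashboard hunks above are short module aliases. The alias definitions live in an import block earlier in dashboard.py that is not part of this excerpt; the following is only a sketch of what that block presumably looks like, with the alias names inferred from the qualified calls above.

# Hypothetical sketch of the alias imports assumed by the hunks above;
# the real definitions are earlier in dashboard.py and are not shown here.
import itertools

import bokeh.layouts as bl          # bl.row, bl.column, bl.Spacer
import bokeh.models as bm           # bm.ColumnDataSource, bm.Range1d
import bokeh.models.widgets as bw   # bw.Button, bw.Select, bw.Div, ...
import bokeh.plotting as bp         # bp.figure, bp.curdoc
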
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import matplotlib.pyplot as plt
 import numpy as np

@@ -24,9 +24,9 @@ Adds support for displaying math formulas using [MathJax](http://www.mathjax.org

 Author: 2015, Dmitry Shachnev <mitya57@gmail.com>.
 '''

 import markdown


 class MathExtension(markdown.extensions.Extension):
 def __init__(self, *args, **kwargs):
 self.config = {

@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 #
 # Copyright (c) 2017 Intel Corporation
 #
@@ -13,11 +14,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

-#!/usr/bin/env python3

 from distutils.core import setup


 long_description = \
 """This extension adds math formulas support to Python-Markdown_
 (works with version 2.6 or newer).

@@ -13,8 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import fnmatch
+import os


-import os, fnmatch, sys
 def findReplace(directory, find, replace, filePattern):
 for path, dirs, files in os.walk(os.path.abspath(directory)):
 for filename in fnmatch.filter(files, filePattern):
@@ -25,6 +27,7 @@ def findReplace(directory, find, replace, filePattern):
 with open(filepath, "w") as f:
 f.write(s)


 if __name__ == "__main__":
 findReplace('./site/', '/"', '/index.html"', "*.html")
 findReplace('./site/', '"/index.html"', '"./index.html"', "*.html")

@@ -13,15 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from logger import *
-from utils import Enum, get_open_port
-from environments.gym_environment_wrapper import *
-from environments.doom_environment_wrapper import *
-from environments.carla_environment_wrapper import *
+from environments.gym_environment_wrapper import GymEnvironmentWrapper
+from environments.doom_environment_wrapper import DoomEnvironmentWrapper
+from environments.carla_environment_wrapper import CarlaEnvironmentWrapper
+import utils


-class EnvTypes(Enum):
+class EnvTypes(utils.Enum):
 Doom = "DoomEnvironmentWrapper"
 Gym = "GymEnvironmentWrapper"
 Carla = "CarlaEnvironmentWrapper"
@@ -31,6 +29,3 @@ def create_environment(tuning_parameters):
 env_type_name, env_type = EnvTypes().verify(tuning_parameters.env.type)
 env = eval(env_type)(tuning_parameters)
 return env
-
-
-

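With the star imports gone, create_environment still resolves the wrapper class with eval(env_type), which only works because the three wrapper classes are now imported into this module's namespace by name. Purely as an illustrative sketch (not part of the commit), the same lookup could be done through an explicit registry instead of eval:

# Illustrative only: an explicit registry instead of eval(); the dict and
# the function body below are a sketch, not code from this commit.
from environments.gym_environment_wrapper import GymEnvironmentWrapper
from environments.doom_environment_wrapper import DoomEnvironmentWrapper
from environments.carla_environment_wrapper import CarlaEnvironmentWrapper

_ENV_WRAPPERS = {
    "GymEnvironmentWrapper": GymEnvironmentWrapper,
    "DoomEnvironmentWrapper": DoomEnvironmentWrapper,
    "CarlaEnvironmentWrapper": CarlaEnvironmentWrapper,
}

def create_environment(tuning_parameters):
    # env_type is the wrapper class name as a string, as in the hunk above
    env_type_name, env_type = EnvTypes().verify(tuning_parameters.env.type)
    return _ENV_WRAPPERS[env_type](tuning_parameters)
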
@@ -1,34 +1,31 @@
+import logging
+import os
+import signal
+import subprocess
 import sys
-from os import path, environ

-try:
-if 'CARLA_ROOT' in environ:
-sys.path.append(path.join(environ.get('CARLA_ROOT'), 'PythonClient'))
-from carla.client import CarlaClient
-from carla.settings import CarlaSettings
-from carla.tcp import TCPConnectionError
-from carla.sensor import Camera
-from carla.client import VehicleControl
-except ImportError:
-from logger import failed_imports
-failed_imports.append("CARLA")

 import numpy as np
-import time
-import logging
-import subprocess
-import signal
-from environments.environment_wrapper import EnvironmentWrapper
-from utils import *
-from logger import screen, logger
-from PIL import Image
+import logger
+try:
+if 'CARLA_ROOT' in os.environ:
+sys.path.append(os.path.join(os.environ.get('CARLA_ROOT'),
+'PythonClient'))
+from carla import client as carla_client
+from carla import settings as carla_settings
+from carla import sensor as carla_sensor
+except ImportError:
+logger.failed_imports.append("CARLA")
+from environments import environment_wrapper as ew
+import utils


 # enum of the available levels and their path
-class CarlaLevel(Enum):
+class CarlaLevel(utils.Enum):
 TOWN1 = "/Game/Maps/Town01"
 TOWN2 = "/Game/Maps/Town02"


 key_map = {
 'BRAKE': (274,),  # down arrow
 'GAS': (273,),  # up arrow
@@ -41,16 +38,16 @@ key_map = {
 }


-class CarlaEnvironmentWrapper(EnvironmentWrapper):
+class CarlaEnvironmentWrapper(ew.EnvironmentWrapper):
 def __init__(self, tuning_parameters):
-EnvironmentWrapper.__init__(self, tuning_parameters)
+ew.EnvironmentWrapper.__init__(self, tuning_parameters)

 self.tp = tuning_parameters

 # server configuration
 self.server_height = self.tp.env.server_height
 self.server_width = self.tp.env.server_width
-self.port = get_open_port()
+self.port = utils.get_open_port()
 self.host = 'localhost'
 self.map = CarlaLevel().get(self.tp.env.level)

@@ -70,7 +67,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
 self.settings = fp.read()
 else:
 # hard coded settings
-self.settings = CarlaSettings()
+self.settings = carla_settings.CarlaSettings()
 self.settings.set(
 SynchronousMode=True,
 SendNonPlayerAgentsInfo=False,
@@ -80,7 +77,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
 self.settings.randomize_seeds()

 # add cameras
-camera = Camera('CameraRGB')
+camera = carla_sensor.Camera('CameraRGB')
 camera.set_image_size(self.width, self.height)
 camera.set_position(200, 0, 140)
 camera.set_rotation(0, 0, 0)
@@ -92,7 +89,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
 logging.disable(40)

 # open the client
-self.game = CarlaClient(self.host, self.port, timeout=99999999)
+self.game = carla_client.CarlaClient(self.host, self.port, timeout=99999999)
 self.game.connect()
 scene = self.game.load_settings(self.settings)

@@ -141,9 +138,9 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
 self.renderer.create_screen(image.shape[1], image.shape[0])

 def _open_server(self):
-log_path = path.join(logger.experiments_path, "CARLA_LOG_{}.txt".format(self.port))
+log_path = os.path.join(logger.logger.experiments_path, "CARLA_LOG_{}.txt".format(self.port))
 with open(log_path, "wb") as out:
-cmd = [path.join(environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map,
+cmd = [os.path.join(os.environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map,
 "-benchmark", "-carla-server", "-fps=10", "-world-port={}".format(self.port),
 "-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height),
 "-carla-no-hud"]
@@ -201,7 +198,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper):
 action = action_idx
 self.last_action_idx = action

-self.control = VehicleControl()
+self.control = carla_client.VehicleControl()
 self.control.throttle = np.clip(action[0], 0, 1)
 self.control.steer = np.clip(action[1], -1, 1)
 self.control.brake = np.abs(np.clip(action[0], -1, 0))

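The rewritten CARLA import block follows the same guarded pattern for optional dependencies that appears elsewhere in the diff: extend sys.path from an environment variable, import the package under an alias, and record a failure instead of crashing when the package is missing. A minimal, generic sketch of that pattern under the assumption that the coach logger module exposes failed_imports (the package name and SOMEPKG_ROOT variable below are placeholders, not additional coach code):

# Generic sketch of the optional-import guard used above; "somepkg" and
# SOMEPKG_ROOT are placeholders for illustration only.
import os
import sys

import logger

try:
    if 'SOMEPKG_ROOT' in os.environ:
        sys.path.append(os.path.join(os.environ['SOMEPKG_ROOT'], 'PythonClient'))
    from somepkg import client as somepkg_client
except ImportError:
    logger.failed_imports.append("somepkg")
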
@@ -13,23 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import enum
+import os

+import numpy as np

+import logger
 try:
 import vizdoom
 except ImportError:
-from logger import failed_imports
-failed_imports.append("ViZDoom")
+logger.failed_imports.append("ViZDoom")

-import numpy as np
-from environments.environment_wrapper import EnvironmentWrapper
-from os import path, environ
-from utils import *
-from logger import *
+from environments import environment_wrapper as ew
+import utils


 # enum of the available levels and their path
-class DoomLevel(Enum):
+class DoomLevel(utils.Enum):
 BASIC = "basic.cfg"
 DEFEND = "defend_the_center.cfg"
 DEATHMATCH = "deathmatch.cfg"
@@ -40,6 +40,7 @@ class DoomLevel(Enum):
 DEFEND_THE_LINE = "defend_the_line.cfg"
 DEADLY_CORRIDOR = "deadly_corridor.cfg"


 key_map = {
 'NO-OP': 96,  # `
 'ATTACK': 13,  # enter
@@ -78,15 +79,16 @@ key_map = {
 }


-class DoomEnvironmentWrapper(EnvironmentWrapper):
+class DoomEnvironmentWrapper(ew.EnvironmentWrapper):
 def __init__(self, tuning_parameters):
-EnvironmentWrapper.__init__(self, tuning_parameters)
+ew.EnvironmentWrapper.__init__(self, tuning_parameters)

 # load the emulator with the required level
 self.level = DoomLevel().get(self.tp.env.level)
-self.scenarios_dir = path.join(environ.get('VIZDOOM_ROOT'), 'scenarios')
+self.scenarios_dir = os.path.join(os.environ.get('VIZDOOM_ROOT'),
+'scenarios')
 self.game = vizdoom.DoomGame()
-self.game.load_config(path.join(self.scenarios_dir, self.level))
+self.game.load_config(os.path.join(self.scenarios_dir, self.level))
 self.game.set_window_visible(False)
 self.game.add_game_args("+vid_forcesurface 1")

@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

-import numpy as np
-from utils import *
-from configurations import Preset
-from renderer import Renderer
 import operator
 import time

+import numpy as np

+import renderer
+import utils


 class EnvironmentWrapper(object):
 def __init__(self, tuning_parameters):
@@ -50,7 +50,7 @@ class EnvironmentWrapper(object):
 self.height = 1
 self.is_state_type_image = True
 self.measurements_size = 0
-self.phase = RunPhase.TRAIN
+self.phase = utils.RunPhase.TRAIN
 self.tp = tuning_parameters
 self.record_video_every = self.tp.visualization.record_video_every
 self.env_id = self.tp.env.level
@@ -62,7 +62,7 @@ class EnvironmentWrapper(object):
 self.wait_for_explicit_human_action = False
 self.is_rendered = self.is_rendered or self.human_control
 self.game_is_open = True
-self.renderer = Renderer()
+self.renderer = renderer.Renderer()

 @property
 def measurements(self):

@@ -13,40 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import random

-import sys
-from logger import *
 import gym
 import numpy as np
-import time
-import random
-try:
-import roboschool
-from OpenGL import GL
-except ImportError:
-from logger import failed_imports
-failed_imports.append("RoboSchool")

-try:
-from gym_extensions.continuous import mujoco
-except:
-from logger import failed_imports
-failed_imports.append("GymExtensions")
+from environments import environment_wrapper as ew
+import utils

-try:
-import pybullet_envs
-except ImportError:
-from logger import failed_imports
-failed_imports.append("PyBullet")

-from gym import wrappers
-from utils import force_list, RunPhase
-from environments.environment_wrapper import EnvironmentWrapper


-class GymEnvironmentWrapper(EnvironmentWrapper):
+class GymEnvironmentWrapper(ew.EnvironmentWrapper):
 def __init__(self, tuning_parameters):
-EnvironmentWrapper.__init__(self, tuning_parameters)
+ew.EnvironmentWrapper.__init__(self, tuning_parameters)

 # env parameters
 if ':' in self.env_id:
@@ -124,7 +102,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):

 def _update_state(self):
 if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'):
-if self.phase == RunPhase.TRAIN and hasattr(self, 'current_ale_lives'):
+if self.phase == utils.RunPhase.TRAIN and hasattr(self, 'current_ale_lives'):
 # signal termination for life loss
 if self.current_ale_lives != self.env.env.ale.lives():
 self.done = True
@@ -146,7 +124,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
 if type(action_idx) == int and action_idx == 0:
 # deal with the "reset" action 0
 action = [0] * self.env.action_space.shape[0]
-action = np.array(force_list(action))
+action = np.array(utils.force_list(action))
 # removing redundant dimensions such that the action size will match the expected action size from gym
 if action.shape != self.env.action_space.shape:
 action = np.squeeze(action)

@@ -13,16 +13,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from exploration_policies.additive_noise import AdditiveNoise
+from exploration_policies.approximated_thompson_sampling_using_dropout import ApproximatedThompsonSamplingUsingDropout
+from exploration_policies.bayesian import Bayesian
+from exploration_policies.boltzmann import Boltzmann
+from exploration_policies.bootstrapped import Bootstrapped
+from exploration_policies.categorical import Categorical
+from exploration_policies.continuous_entropy import ContinuousEntropy
+from exploration_policies.e_greedy import EGreedy
+from exploration_policies.exploration_policy import ExplorationPolicy
+from exploration_policies.greedy import Greedy
+from exploration_policies.ou_process import OUProcess
+from exploration_policies.thompson_sampling import ThompsonSampling

-from exploration_policies.additive_noise import *
-from exploration_policies.approximated_thompson_sampling_using_dropout import *
-from exploration_policies.bayesian import *
-from exploration_policies.boltzmann import *
-from exploration_policies.bootstrapped import *
-from exploration_policies.categorical import *
-from exploration_policies.continuous_entropy import *
-from exploration_policies.e_greedy import *
-from exploration_policies.exploration_policy import *
-from exploration_policies.greedy import *
-from exploration_policies.ou_process import *
-from exploration_policies.thompson_sampling import *
+__all__ = [AdditiveNoise,
+ApproximatedThompsonSamplingUsingDropout,
+Bayesian,
+Boltzmann,
+Bootstrapped,
+Categorical,
+ContinuousEntropy,
+EGreedy,
+ExplorationPolicy,
+Greedy,
+OUProcess,
+ThompsonSampling]

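Note that the new __all__ list stores the class objects themselves. __all__ is conventionally a list of strings naming the module's public attributes, and "from exploration_policies import *" expects string entries; the string-based form would look like the sketch below (shown for comparison only, not what the commit does):

# Conventional string form of __all__ (comparison sketch, not from the commit).
__all__ = ['AdditiveNoise',
           'ApproximatedThompsonSamplingUsingDropout',
           'Bayesian',
           'Boltzmann',
           'Bootstrapped',
           'Categorical',
           'ContinuousEntropy',
           'EGreedy',
           'ExplorationPolicy',
           'Greedy',
           'OUProcess',
           'ThompsonSampling']
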
@@ -13,18 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import numpy as np
-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy
+import utils


-class AdditiveNoise(ExplorationPolicy):
+class AdditiveNoise(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.variance = tuning_parameters.exploration.initial_noise_variance_percentage
 self.final_variance = tuning_parameters.exploration.final_noise_variance_percentage
 self.decay_steps = tuning_parameters.exploration.noise_variance_decay_steps
@@ -37,7 +38,7 @@ class AdditiveNoise(ExplorationPolicy):
 self.variance = self.final_variance

 def get_action(self, action_values):
-if self.phase == RunPhase.TRAIN:
+if self.phase == utils.RunPhase.TRAIN:
 self.decay_exploration()
 action = np.random.normal(action_values, 2 * self.variance * self.action_abs_range)
 return action  #np.clip(action, -self.action_abs_range, self.action_abs_range).squeeze()

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy


-class ApproximatedThompsonSamplingUsingDropout(ExplorationPolicy):
+class ApproximatedThompsonSamplingUsingDropout(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.dropout_discard_probability = tuning_parameters.exploration.dropout_discard_probability
 self.network = tuning_parameters.network
 self.assign_op = self.network.dropout_discard_probability.assign(self.dropout_discard_probability)

@@ -13,18 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
-import tensorflow as tf
+from exploration_policies import exploration_policy
+import utils


-class Bayesian(ExplorationPolicy):
+class Bayesian(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.keep_probability = tuning_parameters.exploration.initial_keep_probability
 self.final_keep_probability = tuning_parameters.exploration.final_keep_probability
 self.keep_probability_decay_delta = (
@@ -40,7 +41,7 @@ class Bayesian(ExplorationPolicy):
 self.keep_probability -= self.keep_probability_decay_delta

 def get_action(self, action_values):
-if self.phase == RunPhase.TRAIN:
+if self.phase == utils.RunPhase.TRAIN:
 self.decay_keep_probability()
 # dropout = self.network.get_layer('variable_dropout_1')
 # with tf.Session() as sess:

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy
+import utils

-class Boltzmann(ExplorationPolicy):
+class Boltzmann(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.temperature = tuning_parameters.exploration.initial_temperature
 self.final_temperature = tuning_parameters.exploration.final_temperature
 self.temperature_decay_delta = (
@@ -35,7 +36,7 @@ class Boltzmann(ExplorationPolicy):
 self.temperature -= self.temperature_decay_delta

 def get_action(self, action_values):
-if self.phase == RunPhase.TRAIN:
+if self.phase == utils.RunPhase.TRAIN:
 self.decay_temperature()
 # softmax calculation
 exp_probabilities = np.exp(action_values / self.temperature)

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.e_greedy import *
+from exploration_policies import e_greedy


-class Bootstrapped(EGreedy):
+class Bootstrapped(e_greedy.EGreedy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running parameters
 :type tuning_parameters: Preset
 """
-EGreedy.__init__(self, tuning_parameters)
+e_greedy.EGreedy.__init__(self, tuning_parameters)
 self.num_heads = tuning_parameters.exploration.architecture_num_q_heads
 self.selected_head = 0

@@ -31,7 +32,7 @@ class Bootstrapped(EGreedy):
 self.selected_head = np.random.randint(self.num_heads)

 def get_action(self, action_values):
-return EGreedy.get_action(self, action_values[self.selected_head])
+return e_greedy.EGreedy.get_action(self, action_values[self.selected_head])

 def get_control_param(self):
 return self.selected_head

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy


-class Categorical(ExplorationPolicy):
+class Categorical(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)

 def get_action(self, action_values):
 # choose actions according to the probabilities

@@ -13,10 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import numpy as np
-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy


-class ContinuousEntropy(ExplorationPolicy):
+class ContinuousEntropy(exploration_policy.ExplorationPolicy):
 pass

@@ -13,17 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy
+import utils


-class EGreedy(ExplorationPolicy):
+class EGreedy(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.epsilon = tuning_parameters.exploration.initial_epsilon
 self.final_epsilon = tuning_parameters.exploration.final_epsilon
 self.epsilon_decay_delta = (
@@ -52,9 +54,9 @@ class EGreedy(ExplorationPolicy):
 self.variance = self.final_variance

 def get_action(self, action_values):
-if self.phase == RunPhase.TRAIN:
+if self.phase == utils.RunPhase.TRAIN:
 self.decay_exploration()
-epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon
+epsilon = self.evaluation_epsilon if self.phase == utils.RunPhase.TEST else self.epsilon

 if self.discrete_controls:
 top_action = np.argmax(action_values)
@@ -67,4 +69,4 @@ class EGreedy(ExplorationPolicy):
 return np.squeeze(action_values + (np.random.rand() < epsilon) * noise)

 def get_control_param(self):
-return self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon
+return self.evaluation_epsilon if self.phase == utils.RunPhase.TEST else self.epsilon

@@ -13,10 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import numpy as np
-from utils import *
-from configurations import *
+import utils


 class ExplorationPolicy(object):
@@ -25,7 +22,7 @@ class ExplorationPolicy(object):
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-self.phase = RunPhase.HEATUP
+self.phase = utils.RunPhase.HEATUP
 self.action_space_size = tuning_parameters.env.action_space_size
 self.action_abs_range = tuning_parameters.env_instance.action_space_abs_range
 self.discrete_controls = tuning_parameters.env_instance.discrete_controls

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy


-class Greedy(ExplorationPolicy):
+class Greedy(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)

 def get_action(self, action_values):
 return np.argmax(action_values)

@@ -13,21 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 import numpy as np
-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy

 # Based on on the description in:
 # https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab

 # Ornstein-Uhlenbeck process
-class OUProcess(ExplorationPolicy):
+class OUProcess(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.action_space_size = tuning_parameters.env.action_space_size
 self.mu = float(tuning_parameters.exploration.mu) * np.ones(self.action_space_size)
 self.theta = tuning_parameters.exploration.theta

@@ -13,17 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import numpy as np

-from exploration_policies.exploration_policy import *
+from exploration_policies import exploration_policy


-class ThompsonSampling(ExplorationPolicy):
+class ThompsonSampling(exploration_policy.ExplorationPolicy):
 def __init__(self, tuning_parameters):
 """
 :param tuning_parameters: A Preset class instance with all the running paramaters
 :type tuning_parameters: Preset
 """
-ExplorationPolicy.__init__(self, tuning_parameters)
+exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters)
 self.action_space_size = tuning_parameters.env.action_space_size

 def get_action(self, action_values):

logger.py
@@ -13,19 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import datetime
-from pandas import *
 import os
 import re
-from pprint import pprint
-import threading
-from subprocess import Popen, PIPE
-import time
-import datetime
-from six.moves import input
-from PIL import Image
-from typing import Union
 import shutil
+import time
+import typing

+import pandas
+import PIL
+from six.moves import input

 global failed_imports
 failed_imports = []
@@ -90,7 +87,7 @@ class ScreenLogger(object):
 def ask_input(self, title):
 return input("{}{}{}".format(Colors.BG_CYAN, title, Colors.END))

-def ask_yes_no(self, title: str, default: Union[None, bool]=None):
+def ask_yes_no(self, title: str, default: typing.Union[None, bool]=None):
 """
 Ask the user for a yes / no question and return True if the answer is yes and False otherwise.
 The function will keep asking the user for an answer until he answers one of the possible responses.
@@ -156,7 +153,7 @@ class BaseLogger(object):
 class Logger(BaseLogger):
 def __init__(self):
 BaseLogger.__init__(self)
-self.data = DataFrame()
+self.data = pandas.DataFrame()
 self.csv_path = ''
 self.doc_path = ''
 self.aggregated_data_across_threads = None
@@ -249,7 +246,7 @@ class Logger(BaseLogger):
 if not os.path.exists(output_dir):
 os.makedirs(output_dir)
 output_path = os.path.join(output_dir, output_file)
-pil_images = [Image.fromarray(image) for image in images]
+pil_images = [PIL.Image.fromarray(image) for image in images]
 pil_images[0].save(output_path, save_all=True, append_images=pil_images[1:], duration=1.0 / fps, loop=0)

 def remove_experiment_dir(self):

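One caveat about the logger.py hunks: a bare "import PIL" does not normally make the PIL.Image submodule available, so PIL.Image.fromarray(...) may raise AttributeError unless PIL.Image is imported somewhere else first. A safer spelling, shown only as a suggestion and not as part of the commit:

# Suggested alternative to the bare "import PIL" above (not from the commit):
# importing the submodule explicitly keeps the module-qualified call style.
import PIL.Image

# "images" stands for the list of numpy arrays used in the hunk above.
pil_images = [PIL.Image.fromarray(image) for image in images]
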
@@ -13,7 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+from memories.differentiable_neural_dictionary import AnnoyDictionary
+from memories.differentiable_neural_dictionary import AnnoyIndex
+from memories.differentiable_neural_dictionary import QDND
+from memories.episodic_experience_replay import EpisodicExperienceReplay
+from memories.memory import Episode
+from memories.memory import Memory
+from memories.memory import Transition

-from memories.differentiable_neural_dictionary import *
-from memories.episodic_experience_replay import *
-from memories.memory import *
+__all__ = [AnnoyDictionary,
+AnnoyIndex,
+Episode,
+EpisodicExperienceReplay,
+Memory,
+QDND,
+Transition]

@@ -13,10 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import os
+import pickle

 import numpy as np
 from annoy import AnnoyIndex
-import os, pickle


 class AnnoyDictionary(object):

@@ -13,24 +13,25 @@
|
|||||||
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from memories.memory import *
-import threading
-from typing import Union
+import typing
+
+import numpy as np
+
+from memories import memory
 
 
-class EpisodicExperienceReplay(Memory):
+class EpisodicExperienceReplay(memory.Memory):
     def __init__(self, tuning_parameters):
         """
         :param tuning_parameters: A Preset class instance with all the running paramaters
         :type tuning_parameters: Preset
         """
-        Memory.__init__(self, tuning_parameters)
+        memory.Memory.__init__(self, tuning_parameters)
         self.tp = tuning_parameters
         self.max_size_in_episodes = tuning_parameters.agent.num_episodes_in_experience_replay
         self.max_size_in_transitions = tuning_parameters.agent.num_transitions_in_experience_replay
         self.discount = tuning_parameters.agent.discount
-        self.buffer = [Episode()]  # list of episodes
+        self.buffer = [memory.Episode()]  # list of episodes
         self.transitions = []
         self._length = 1
         self._num_transitions = 0
@@ -96,7 +97,7 @@ class EpisodicExperienceReplay(Memory):
 
     def store(self, transition):
         if len(self.buffer) == 0:
-            self.buffer.append(Episode())
+            self.buffer.append(memory.Episode())
         last_episode = self.buffer[-1]
         last_episode.insert(transition)
         self.transitions.append(transition)
@@ -109,7 +110,7 @@ class EpisodicExperienceReplay(Memory):
                                               n_step_return=self.tp.agent.n_step)
             self.buffer[-1].update_measurements_targets(self.tp.agent.num_predicted_steps_ahead)
             # self.buffer[-1].update_actions_probabilities()  # used for off-policy policy optimization
-            self.buffer.append(Episode())
+            self.buffer.append(memory.Episode())
 
         self.enforce_length()
 
@@ -148,7 +149,7 @@ class EpisodicExperienceReplay(Memory):
     def get(self, index):
         return self.get_episode(index)
 
-    def get_last_complete_episode(self) -> Union[None, Episode]:
+    def get_last_complete_episode(self) -> typing.Union[None, memory.Episode]:
         """
         Returns the last complete episode in the memory or None if there are no complete episodes
         :return: None or the last complete episode
@@ -170,7 +171,7 @@ class EpisodicExperienceReplay(Memory):
 
     def clean(self):
         self.transitions = []
-        self.buffer = [Episode()]
+        self.buffer = [memory.Episode()]
         self._length = 1
         self._num_transitions = 0
         self._num_transitions_in_complete_episodes = 0
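The return annotation above now spells out typing.Union and memory.Episode instead of relying on names pulled in by a star import. A short, runnable, stdlib-only sketch of the same annotation style (nothing below is coach code; the function and values are made up for illustration):

    import typing


    def last_item(items: list) -> typing.Union[None, int]:
        """Return the last element, or None when the list is empty."""
        return items[-1] if items else None


    print(last_item([1, 2, 3]))  # 3
    print(last_item([]))         # None
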
@@ -13,10 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
 import numpy as np
-import copy
-from configurations import *
 
 
 class Memory(object):
@@ -13,19 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
 import argparse
-import tensorflow as tf
-from architectures import *
-from environments import *
-from agents import *
-from utils import *
+import os
 import time
-import copy
-from logger import *
-from configurations import *
-from presets import *
-import shutil
+
+import tensorflow as tf
+
+import agents
+import environments
+import logger
+import presets
 
 start_time = time.time()
 
@@ -66,15 +63,15 @@ if __name__ == "__main__":
 
     elif args.job_name == "worker":
         # get tuning parameters
-        tuning_parameters = json_to_preset(args.load_json_path)
+        tuning_parameters = presets.json_to_preset(args.load_json_path)
 
         # dump documentation
         if not os.path.exists(tuning_parameters.experiment_path):
             os.makedirs(tuning_parameters.experiment_path)
         if tuning_parameters.evaluate_only:
-            logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id, filename='evaluator')
+            logger.logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id, filename='evaluator')
         else:
-            logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id)
+            logger.logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id)
 
         # multi-threading parameters
         tuning_parameters.start_time = start_time
@@ -98,8 +95,8 @@ if __name__ == "__main__":
                                          cluster=cluster)
 
         # create the agent and the environment
-        env_instance = create_environment(tuning_parameters)
-        exec('agent = ' + tuning_parameters.agent.type + '(env_instance, tuning_parameters, replicated_device=device, '
+        env_instance = environments.create_environment(tuning_parameters)
+        exec('agent = agents.' + tuning_parameters.agent.type + '(env_instance, tuning_parameters, replicated_device=device, '
                                                          'thread_id=tuning_parameters.task_id)')
 
         # building the scaffold
@@ -169,6 +166,6 @@ if __name__ == "__main__":
             else:
                 agent.improve()
         else:
-            screen.error("Invalid mode requested for parallel_actor.")
+            logger.screen.error("Invalid mode requested for parallel_actor.")
             exit(1)
 
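The exec() call above now prefixes the class name with agents., so the lookup happens in the agents module rather than in whatever a star import happened to place in the global namespace. A runnable, stdlib-only sketch of the same dynamic-lookup idea using getattr instead of exec (collections stands in for the agents module and the class-name string is made up):

    import collections

    # The class name arrives as a string, e.g. read from a preset or a JSON config.
    class_name = "OrderedDict"

    # Resolve the name inside one explicit module namespace instead of exec'ing
    # into globals(); getattr raises AttributeError for an unknown name.
    cls = getattr(collections, class_name)
    instance = cls(a=1, b=2)

    print(type(instance).__name__)  # prints: OrderedDict

This is only an alternative sketch of the lookup, not what the commit does: the commit keeps the exec() call and merely qualifies the class name with the agents module.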
@@ -1,8 +1,10 @@
 import argparse
+import os
 
 import matplotlib
 import matplotlib.pyplot as plt
 
 from dashboard import SignalsFile
-import os
 
 
 class FigureMaker(object):
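The only change in the hunk above is moving import os up into the standard-library group, matching the three-group ordering used throughout the commit. A small runnable sketch of that layout (assumes numpy is installed; the project-module group is only indicated in a comment, since a standalone snippet has no local packages):

    # Standard-library imports first...
    import argparse
    import os

    # ...then third-party packages...
    import numpy as np

    # ...then project modules (agents, environments, logger, presets in this
    # repository) would form a third group, each group separated by a blank line.

    print(os.getcwd())
    print(np.__version__)
    print(argparse.ArgumentParser().parse_args([]))
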
presets.py: 416 changes
File diff suppressed because it is too large
@@ -1,6 +1,6 @@
-import pygame
-from pygame.locals import *
 import numpy as np
+
+import pygame
+from pygame import locals as loc
 
 
 class Renderer(object):
@@ -21,7 +21,8 @@ class Renderer(object):
         :return: None
         """
         self.size = (width, height)
-        self.screen = self.display.set_mode(self.size, HWSURFACE | DOUBLEBUF)
+        self.screen = self.display.set_mode(self.size,
+                                            loc.HWSURFACE | loc.DOUBLEBUF)
         self.display.set_caption("Coach")
         self.is_open = True
 
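With from pygame import locals as loc, the display flags keep a visible module prefix instead of arriving via from pygame.locals import *. A minimal sketch of the namespaced flags (assumes pygame is installed and a video device or SDL dummy driver is available; the window size and caption are made up):

    import pygame
    from pygame import locals as loc

    pygame.display.init()
    # The flags are now clearly pygame constants rather than unexplained globals.
    screen = pygame.display.set_mode((320, 240), loc.HWSURFACE | loc.DOUBLEBUF)
    pygame.display.set_caption("namespaced flags demo")
    pygame.display.quit()
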
run_test.py: 54 changes
@@ -13,23 +13,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-# -*- coding: utf-8 -*-
-import presets
-import numpy as np
-import pandas as pd
-from os import path
-import os
+import argparse
 import glob
+import os
 import shutil
+import signal
+import subprocess
 import sys
 import time
-from logger import screen
-from utils import list_all_classes_in_module, threaded_cmd_line_run, killed_processes
-import subprocess
-import signal
-import argparse
 
+import numpy as np
+import pandas as pd
+
+import logger
+import presets
+import utils
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -61,7 +59,7 @@ if __name__ == '__main__':
     if args.preset is not None:
         presets_lists = [args.preset]
     else:
-        presets_lists = list_all_classes_in_module(presets)
+        presets_lists = utils.list_all_classes_in_module(presets)
     win_size = 10
     fail_count = 0
     test_count = 0
@@ -70,7 +68,7 @@ if __name__ == '__main__':
     # create a clean experiment directory
     test_name = '__test'
     test_path = os.path.join('./experiments', test_name)
-    if path.exists(test_path):
+    if os.path.exists(test_path):
        shutil.rmtree(test_path)
    if args.ignore_presets is not None:
        presets_to_ignore = args.ignore_presets.split(',')
@@ -100,7 +98,7 @@ if __name__ == '__main__':
            test_count += 1

            # run the experiment in a separate thread
-            screen.log_title("Running test {} - {}".format(preset_name, framework))
+            logger.screen.log_title("Running test {} - {}".format(preset_name, framework))
            log_file_name = 'test_log_{preset_name}_{framework}.txt'.format(
                preset_name=preset_name,
                framework=framework,
@@ -139,7 +137,7 @@ if __name__ == '__main__':

            tries_counter = 0
            while not csv_paths:
-                csv_paths = glob.glob(path.join(test_path, '*', filename_pattern))
+                csv_paths = glob.glob(os.path.join(test_path, '*', filename_pattern))
                if tries_counter > read_csv_tries:
                    break
                tries_counter += 1
@@ -195,26 +193,26 @@ if __name__ == '__main__':
            # kill test and print result
            os.killpg(os.getpgid(p.pid), signal.SIGTERM)
            if test_passed:
-                screen.success("Passed successfully")
+                logger.screen.success("Passed successfully")
            else:
                if csv_paths:
-                    screen.error("Failed due to insufficient reward", crash=False)
-                    screen.error("preset.test_max_step_threshold: {}".format(preset.test_max_step_threshold), crash=False)
-                    screen.error("preset.test_min_return_threshold: {}".format(preset.test_min_return_threshold), crash=False)
-                    screen.error("averaged_rewards: {}".format(averaged_rewards), crash=False)
-                    screen.error("episode number: {}".format(csv['Episode #'].values[-1]), crash=False)
+                    logger.screen.error("Failed due to insufficient reward", crash=False)
+                    logger.screen.error("preset.test_max_step_threshold: {}".format(preset.test_max_step_threshold), crash=False)
+                    logger.screen.error("preset.test_min_return_threshold: {}".format(preset.test_min_return_threshold), crash=False)
+                    logger.screen.error("averaged_rewards: {}".format(averaged_rewards), crash=False)
+                    logger.screen.error("episode number: {}".format(csv['Episode #'].values[-1]), crash=False)
                else:
-                    screen.error("csv file never found", crash=False)
+                    logger.screen.error("csv file never found", crash=False)
                if args.verbose:
-                    screen.error("command exitcode: {}".format(p.returncode), crash=False)
-                    screen.error(open(log_file_name).read(), crash=False)
+                    logger.screen.error("command exitcode: {}".format(p.returncode), crash=False)
+                    logger.screen.error(open(log_file_name).read(), crash=False)

                fail_count += 1
            shutil.rmtree(test_path)


-    screen.separator()
+    logger.screen.separator()
    if fail_count == 0:
-        screen.success(" Summary: " + str(test_count) + "/" + str(test_count) + " tests passed successfully")
+        logger.screen.success(" Summary: " + str(test_count) + "/" + str(test_count) + " tests passed successfully")
    else:
-        screen.error(" Summary: " + str(test_count - fail_count) + "/" + str(test_count) + " tests passed successfully")
+        logger.screen.error(" Summary: " + str(test_count - fail_count) + "/" + str(test_count) + " tests passed successfully")
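run_test.py previously reached the same module under two names (from os import path alongside import os); the diff above settles on the fully qualified os.path spelling. A runnable, stdlib-only illustration of that single-spelling convention (the experiment directory name is made up, and the snippet creates it as a side effect):

    import glob
    import os
    import shutil

    test_path = os.path.join('./experiments', '__test')

    # One spelling everywhere: os.path, never a bare `path` alias.
    if os.path.exists(test_path):
        shutil.rmtree(test_path)
    os.makedirs(test_path)

    print(glob.glob(os.path.join(test_path, '*')))  # [] for a freshly created directory
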
utils.py: 20 changes
@@ -13,20 +13,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
-import json
-import inspect
-import os
-import numpy as np
-import threading
-from subprocess import call, Popen
-import signal
 import copy
+import inspect
+import json
+import os
+import signal
+import subprocess
+import threading
 
+import numpy as np
 
 
 killed_processes = []
 
 eps = np.finfo(np.float32).eps
 
 
 class Enum(object):
     def __init__(self):
         pass
@@ -161,7 +163,7 @@ def ClassToDict(x):
 
 
 def cmd_line_run(result, run_cmd, id=-1):
-    p = Popen(run_cmd, shell=True, executable="/bin/bash")
+    p = subprocess.Popen(run_cmd, shell=True, executable="/bin/bash")
     while result[0] is None or result[0] == [None]:
         if id in killed_processes:
             p.kill()
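Qualifying the call as subprocess.Popen makes the origin of the process handle obvious at the call site, replacing the earlier from subprocess import call, Popen with a plain import subprocess. A runnable sketch of the qualified call (assumes /bin/bash exists; the echoed command is a harmless placeholder):

    import subprocess

    # Same shape of call as in cmd_line_run above, with a placeholder command.
    p = subprocess.Popen("echo hello", shell=True, executable="/bin/bash")
    p.wait()
    print("exit code:", p.returncode)  # 0 on success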