diff --git a/agents/__init__.py b/agents/__init__.py index fdbd13e..6be2538 100644 --- a/agents/__init__.py +++ b/agents/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,26 +13,48 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from agents.actor_critic_agent import ActorCriticAgent +from agents.agent import Agent +from agents.bc_agent import BCAgent +from agents.bootstrapped_dqn_agent import BootstrappedDQNAgent +from agents.categorical_dqn_agent import CategoricalDQNAgent +from agents.clipped_ppo_agent import ClippedPPOAgent +from agents.ddpg_agent import DDPGAgent +from agents.ddqn_agent import DDQNAgent +from agents.dfp_agent import DFPAgent +from agents.dqn_agent import DQNAgent +from agents.human_agent import HumanAgent +from agents.imitation_agent import ImitationAgent +from agents.mmc_agent import MixedMonteCarloAgent +from agents.n_step_q_agent import NStepQAgent +from agents.naf_agent import NAFAgent +from agents.nec_agent import NECAgent +from agents.pal_agent import PALAgent +from agents.policy_gradients_agent import PolicyGradientsAgent +from agents.policy_optimization_agent import PolicyOptimizationAgent +from agents.ppo_agent import PPOAgent +from agents.qr_dqn_agent import QuantileRegressionDQNAgent +from agents.value_optimization_agent import ValueOptimizationAgent -from agents.actor_critic_agent import * -from agents.agent import * -from agents.bc_agent import * -from agents.bootstrapped_dqn_agent import * -from agents.clipped_ppo_agent import * -from agents.ddpg_agent import * -from agents.ddqn_agent import * -from agents.dfp_agent import * -from agents.dqn_agent import * -from agents.categorical_dqn_agent import * -from agents.human_agent import * -from agents.imitation_agent import * -from agents.mmc_agent import * -from agents.n_step_q_agent import * -from agents.naf_agent import * -from agents.nec_agent import * -from agents.pal_agent import * -from agents.policy_gradients_agent import * -from agents.policy_optimization_agent import * -from agents.ppo_agent import * -from agents.value_optimization_agent import * -from agents.qr_dqn_agent import * +__all__ = ['ActorCriticAgent', + 'Agent', + 'BCAgent', + 'BootstrappedDQNAgent', + 'CategoricalDQNAgent', + 'ClippedPPOAgent', + 'DDPGAgent', + 'DDQNAgent', + 'DFPAgent', + 'DQNAgent', + 'HumanAgent', + 'ImitationAgent', + 'MixedMonteCarloAgent', + 'NAFAgent', + 'NECAgent', + 'NStepQAgent', + 'PALAgent', + 'PPOAgent', + 'PolicyGradientsAgent', + 'PolicyOptimizationAgent', + 'QuantileRegressionDQNAgent', + 'ValueOptimizationAgent'] diff --git a/agents/actor_critic_agent.py b/agents/actor_critic_agent.py index 729e67f..d514acd 100644 --- a/agents/actor_critic_agent.py +++ b/agents/actor_critic_agent.py @@ -13,23 +13,24 @@ # See the License for the specific language governing permissions and # limitations under the License.
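A brief usage sketch for the reworked agents/__init__.py above (illustrative only, not part of the diff): with explicit imports and string entries in __all__, the exported agent classes resolve statically for linters and IDEs, and star-imports stay well-defined.

    # illustrative: both forms now resolve against the explicit exports above
    from agents import DQNAgent, ClippedPPOAgent
    from agents import *          # honours the string names listed in __all__
    print(DQNAgent.__name__)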
# +import numpy as np +from scipy import signal -from agents.policy_optimization_agent import * -from logger import * -from utils import * -import scipy.signal +from agents import policy_optimization_agent as poa +import utils +import logger # Actor Critic - https://arxiv.org/abs/1602.01783 -class ActorCriticAgent(PolicyOptimizationAgent): +class ActorCriticAgent(poa.PolicyOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False): - PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network) + poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network) self.last_gradient_update_step_idx = 0 - self.action_advantages = Signal('Advantages') - self.state_values = Signal('Values') - self.unclipped_grads = Signal('Grads (unclipped)') - self.value_loss = Signal('Value Loss') - self.policy_loss = Signal('Policy Loss') + self.action_advantages = utils.Signal('Advantages') + self.state_values = utils.Signal('Values') + self.unclipped_grads = utils.Signal('Grads (unclipped)') + self.value_loss = utils.Signal('Value Loss') + self.policy_loss = utils.Signal('Policy Loss') self.signals.append(self.action_advantages) self.signals.append(self.state_values) self.signals.append(self.unclipped_grads) @@ -38,7 +39,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): # Discounting function used to calculate discounted returns. def discount(self, x, gamma): - return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] + return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] def get_general_advantage_estimation_values(self, rewards, values): # values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ... 
t + n) @@ -72,20 +73,20 @@ class ActorCriticAgent(PolicyOptimizationAgent): # estimate the advantage function action_advantages = np.zeros((num_transitions, 1)) - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: if game_overs[-1]: R = 0 else: - R = self.main_network.online_network.predict(last_sample(next_states))[0] + R = self.main_network.online_network.predict(utils.last_sample(next_states))[0] for i in reversed(range(num_transitions)): R = rewards[i] + self.tp.agent.discount * R state_value_head_targets[i] = R action_advantages[i] = R - current_state_values[i] - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps - bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0] + bootstrapped_value = self.main_network.online_network.predict(utils.last_sample(next_states))[0] values = np.append(current_state_values, bootstrapped_value) if game_overs[-1]: values[-1] = 0 @@ -94,7 +95,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values) action_advantages = np.vstack(gae_values) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") action_advantages = action_advantages.squeeze(axis=-1) if not self.env.discrete_controls and len(actions.shape) < 2: @@ -113,7 +114,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # TODO: rename curr_state -> state # convert to batch so we can run it through the network @@ -126,7 +127,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): # DISCRETE state_value, action_probabilities = self.main_network.online_network.predict(curr_state) action_probabilities = action_probabilities.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_probabilities) else: action = np.argmax(action_probabilities) @@ -137,7 +138,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) else: action = action_values_mean diff --git a/agents/agent.py b/agents/agent.py index 006544e..c1f5262 100644 --- a/agents/agent.py +++ b/agents/agent.py @@ -13,32 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
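A small self-contained check of the discount() helper kept in actor_critic_agent.py above (an illustrative sketch, not part of the diff): the lfilter call computes the same right-to-left discounted sum as an explicit loop.

    import numpy as np
    from scipy import signal

    def discount(x, gamma):
        # y[i] = x[i] + gamma * y[i + 1], via an IIR filter over the reversed sequence
        return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

    x, gamma = np.array([1.0, 2.0, 3.0]), 0.5
    expected = np.array([2.75, 3.5, 3.0])   # 1 + 0.5*(2 + 0.5*3), 2 + 0.5*3, 3
    assert np.allclose(discount(x, gamma), expected)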
# - -import scipy.ndimage -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") - -import copy -from renderer import Renderer -from configurations import Preset -from collections import deque -from utils import LazyStack -from collections import OrderedDict -from utils import RunPhase, Signal, is_empty, RunningStat -from architectures import * -from exploration_policies import * -from memories import * -from memories.memory import * -from logger import logger, screen +import collections import random import time -import os -import itertools -from architectures.tensorflow_components.shared_variables import SharedRunningStats + +import logger +try: + import matplotlib.pyplot as plt +except ImportError: + logger.failed_imports.append("matplotlib") + +import numpy as np +from pandas.io import pickle from six.moves import range +import scipy + +from architectures.tensorflow_components import shared_variables as sv +import configurations +import exploration_policies as ep +import memories +from memories import memory +import renderer +import utils class Agent(object): @@ -54,7 +50,7 @@ class Agent(object): :param thread_id: int """ - screen.log_title("Creating agent {}".format(task_id)) + logger.screen.log_title("Creating agent {}".format(task_id)) self.task_id = task_id self.sess = tuning_parameters.sess self.env = tuning_parameters.env_instance = env @@ -71,21 +67,20 @@ class Agent(object): # modules if tuning_parameters.agent.load_memory_from_file_path: - screen.log_title("Loading replay buffer from pickle. Pickle path: {}" + logger.screen.log_title("Loading replay buffer from pickle. Pickle path: {}" .format(tuning_parameters.agent.load_memory_from_file_path)) - self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path) + self.memory = pickle.read_pickle(tuning_parameters.agent.load_memory_from_file_path) else: - self.memory = eval(tuning_parameters.memory + '(tuning_parameters)') - # self.architecture = eval(tuning_parameters.architecture) + self.memory = eval('memories.' + tuning_parameters.memory + '(tuning_parameters)') self.has_global = replicated_device is not None self.replicated_device = replicated_device self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0" - self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)') - self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy + self.exploration_policy = eval('ep.' + tuning_parameters.exploration.policy + '(tuning_parameters)') + self.evaluation_exploration_policy = eval('ep.' 
+ tuning_parameters.exploration.evaluation_policy + '(tuning_parameters)') - self.evaluation_exploration_policy.change_phase(RunPhase.TEST) + self.evaluation_exploration_policy.change_phase(utils.RunPhase.TEST) # initialize all internal variables self.tp = tuning_parameters @@ -100,30 +95,30 @@ class Agent(object): self.episode_running_info = {} self.last_episode_evaluation_ran = 0 self.running_observations = [] - logger.set_current_time(self.current_episode) + logger.logger.set_current_time(self.current_episode) self.main_network = None self.networks = [] self.last_episode_images = [] - self.renderer = Renderer() + self.renderer = renderer.Renderer() # signals self.signals = [] - self.loss = Signal('Loss') + self.loss = utils.Signal('Loss') self.signals.append(self.loss) - self.curr_learning_rate = Signal('Learning Rate') + self.curr_learning_rate = utils.Signal('Learning Rate') self.signals.append(self.curr_learning_rate) if self.tp.env.normalize_observation and not self.env.is_state_type_image: if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers: - self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,)) - self.running_reward_stats = RunningStat(()) + self.running_observation_stats = utils.RunningStat((self.tp.env.desired_observation_width,)) + self.running_reward_stats = utils.RunningStat(()) else: - self.running_observation_stats = SharedRunningStats(self.tp, replicated_device, - shape=(self.tp.env.desired_observation_width,), - name='observation_stats') - self.running_reward_stats = SharedRunningStats(self.tp, replicated_device, - shape=(), - name='reward_stats') + self.running_observation_stats = sv.SharedRunningStats(self.tp, replicated_device, + shape=(self.tp.env.desired_observation_width,), + name='observation_stats') + self.running_reward_stats = sv.SharedRunningStats(self.tp, replicated_device, + shape=(), + name='reward_stats') # env is already reset at this point. Otherwise we're getting an error where you cannot # reset an env which is not done @@ -137,13 +132,13 @@ class Agent(object): def log_to_screen(self, phase): # log to screen if self.current_episode >= 0: - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: exploration = self.exploration_policy.get_control_param() else: exploration = self.evaluation_exploration_policy.get_control_param() - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Worker", self.task_id), ("Episode", self.current_episode), ("total reward", self.total_reward_in_current_episode), @@ -154,37 +149,37 @@ class Agent(object): prefix=phase ) - def update_log(self, phase=RunPhase.TRAIN): + def update_log(self, phase=utils.RunPhase.TRAIN): """ Writes logging messages to screen and updates the log file with all the signal values. 
:return: None """ # log all the signals to file - logger.set_current_time(self.current_episode) - logger.create_signal_value('Training Iter', self.training_iteration) - logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP)) - logger.create_signal_value('ER #Transitions', self.memory.num_transitions()) - logger.create_signal_value('ER #Episodes', self.memory.length()) - logger.create_signal_value('Episode Length', self.current_episode_steps_counter) - logger.create_signal_value('Total steps', self.total_steps_counter) - logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param()) - logger.create_signal_value("Training Reward", self.total_reward_in_current_episode - if phase == RunPhase.TRAIN else np.nan) - logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode - if phase == RunPhase.TEST else np.nan) - logger.create_signal_value('Update Target Network', 0, overwrite=False) - logger.update_wall_clock_time(self.current_episode) + logger.logger.set_current_time(self.current_episode) + logger.logger.create_signal_value('Training Iter', self.training_iteration) + logger.logger.create_signal_value('In Heatup', int(phase == utils.RunPhase.HEATUP)) + logger.logger.create_signal_value('ER #Transitions', self.memory.num_transitions()) + logger.logger.create_signal_value('ER #Episodes', self.memory.length()) + logger.logger.create_signal_value('Episode Length', self.current_episode_steps_counter) + logger.logger.create_signal_value('Total steps', self.total_steps_counter) + logger.logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param()) + logger.logger.create_signal_value("Training Reward", self.total_reward_in_current_episode + if phase == utils.RunPhase.TRAIN else np.nan) + logger.logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode + if phase == utils.RunPhase.TEST else np.nan) + logger.logger.create_signal_value('Update Target Network', 0, overwrite=False) + logger.logger.update_wall_clock_time(self.current_episode) for signal in self.signals: - logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean()) - logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev()) - logger.create_signal_value("{}/Max".format(signal.name), signal.get_max()) - logger.create_signal_value("{}/Min".format(signal.name), signal.get_min()) + logger.logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean()) + logger.logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev()) + logger.logger.create_signal_value("{}/Max".format(signal.name), signal.get_max()) + logger.logger.create_signal_value("{}/Min".format(signal.name), signal.get_min()) # dump if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \ and self.current_episode > 0: - logger.dump_output_csv() + logger.logger.dump_output_csv() def reset_game(self, do_not_reset_env=False): """ @@ -211,7 +206,7 @@ class Agent(object): self.episode_running_info[action] = [] plt.clf() - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == configurations.MiddlewareTypes.LSTM: for network in self.networks: network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init @@ -281,9 +276,9 @@ class Agent(object): if self.total_steps_counter % 
self.tp.agent.num_steps_between_copying_online_weights_to_target == 0: for network in self.networks: network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target) - logger.create_signal_value('Update Target Network', 1) + logger.logger.create_signal_value('Update Target Network', 1) else: - logger.create_signal_value('Update Target Network', 0, overwrite=False) + logger.logger.create_signal_value('Update Target Network', 0, overwrite=False) return loss @@ -321,7 +316,7 @@ class Agent(object): plt.legend() plt.pause(0.00000001) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): """ choose an action to act with in the current episode being played. Different behavior might be exhibited when training or testing. @@ -351,15 +346,15 @@ class Agent(object): for input_name in self.tp.agent.input_types.keys(): input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0) return input_state - + def prepare_initial_state(self): """ Create an initial state when starting a new episode :return: None """ observation = self.preprocess_observation(self.env.state['observation']) - self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) - observation = LazyStack(self.curr_stack, -1) + self.curr_stack = collections.deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) + observation = utils.LazyStack(self.curr_stack, -1) self.curr_state = { 'observation': observation @@ -369,21 +364,21 @@ class Agent(object): if self.tp.agent.use_accumulated_reward_as_measurement: self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0) - def act(self, phase=RunPhase.TRAIN): + def act(self, phase=utils.RunPhase.TRAIN): """ Take one step in the environment according to the network prediction and store the transition in memory :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored :return: A boolean value that signals an episode termination """ - if phase != RunPhase.TEST: + if phase != utils.RunPhase.TEST: self.total_steps_counter += 1 self.current_episode_steps_counter += 1 # get new action action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0} - if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: + if phase == utils.RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: action = self.env.get_random_action() else: action, action_info = self.choose_action(self.curr_state, phase=phase) @@ -402,13 +397,13 @@ class Agent(object): next_state['observation'] = self.preprocess_observation(next_state['observation']) # plot action values online - if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP: + if self.tp.visualization.plot_action_values_online and phase != utils.RunPhase.HEATUP: self.plot_action_values_online() # initialize the next state # TODO: provide option to stack more than just the observation self.curr_stack.append(next_state['observation']) - observation = LazyStack(self.curr_stack, -1) + observation = utils.LazyStack(self.curr_stack, -1) next_state['observation'] = observation if self.tp.agent.use_measurements and 'measurements' in result.keys(): @@ -417,14 +412,14 @@ class Agent(object): next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode) # store the 
transition only if we are training - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: - transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done']) + if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP: + transition = memory.Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done']) for key in action_info.keys(): transition.info[key] = action_info[key] if self.tp.agent.add_a_normalized_timestep_to_the_observation: transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit self.memory.store(transition) - elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs: + elif phase == utils.RunPhase.TEST and self.tp.visualization.dump_gifs: # we store the transitions only for saving gifs self.last_episode_images.append(self.env.get_rendered_image()) @@ -437,7 +432,7 @@ class Agent(object): self.update_log(phase=phase) self.log_to_screen(phase=phase) - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: + if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP: self.reset_game() self.current_episode += 1 @@ -456,8 +451,8 @@ class Agent(object): max_reward_achieved = -float('inf') average_evaluation_reward = 0 - screen.log_title("Running evaluation") - self.env.change_phase(RunPhase.TEST) + logger.screen.log_title("Running evaluation") + self.env.change_phase(utils.RunPhase.TEST) for i in range(num_episodes): # keep the online network in sync with the global network if keep_networks_synced: @@ -466,7 +461,7 @@ class Agent(object): episode_ended = False while not episode_ended: - episode_ended = self.act(phase=RunPhase.TEST) + episode_ended = self.act(phase=utils.RunPhase.TEST) if keep_networks_synced \ and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps: @@ -477,7 +472,7 @@ class Agent(object): max_reward_achieved = self.total_reward_in_current_episode frame_skipping = int(5/self.tp.env.frame_skip) if self.tp.visualization.dump_gifs: - logger.create_gif(self.last_episode_images[::frame_skipping], + logger.logger.create_gif(self.last_episode_images[::frame_skipping], name='score-{}'.format(max_reward_achieved), fps=10) average_evaluation_reward += self.total_reward_in_current_episode @@ -485,8 +480,8 @@ class Agent(object): average_evaluation_reward /= float(num_episodes) - self.env.change_phase(RunPhase.TRAIN) - screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward)) + self.env.change_phase(utils.RunPhase.TRAIN) + logger.screen.log_title("Evaluation done. 
Average reward = {}.".format(average_evaluation_reward)) def post_training_commands(self): pass @@ -505,15 +500,15 @@ class Agent(object): # heatup phase if self.tp.num_heatup_steps != 0: self.in_heatup = True - screen.log_title("Starting heatup {}".format(self.task_id)) + logger.screen.log_title("Starting heatup {}".format(self.task_id)) num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)): - self.act(phase=RunPhase.HEATUP) + self.act(phase=utils.RunPhase.HEATUP) # training phase self.in_heatup = False - screen.log_title("Starting training {}".format(self.task_id)) - self.exploration_policy.change_phase(RunPhase.TRAIN) + logger.screen.log_title("Starting training {}".format(self.task_id)) + self.exploration_policy.change_phase(utils.RunPhase.TRAIN) training_start_time = time.time() model_snapshots_periods_passed = -1 self.reset_game() @@ -557,7 +552,7 @@ class Agent(object): self.loss.add_sample(loss) self.training_iteration += 1 if self.imitation: - self.log_to_screen(RunPhase.TRAIN) + self.log_to_screen(utils.RunPhase.TRAIN) self.post_training_commands() def save_model(self, model_id): diff --git a/agents/bc_agent.py b/agents/bc_agent.py index 70fe3e6..af01720 100644 --- a/agents/bc_agent.py +++ b/agents/bc_agent.py @@ -13,16 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import numpy as np -from agents.imitation_agent import ImitationAgent +from agents import imitation_agent # Behavioral Cloning Agent -class BCAgent(ImitationAgent): +class BCAgent(imitation_agent.ImitationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + imitation_agent.ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def learn_from_batch(self, batch): current_states, _, actions, _, _, _ = self.extract_batch(batch) diff --git a/agents/bootstrapped_dqn_agent.py b/agents/bootstrapped_dqn_agent.py index 3476022..41aea9f 100644 --- a/agents/bootstrapped_dqn_agent.py +++ b/agents/bootstrapped_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import numpy as np -from agents.value_optimization_agent import * - +from agents import value_optimization_agent as voa +import utils # Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf -class BootstrappedDQNAgent(ValueOptimizationAgent): +class BootstrappedDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def reset_game(self, do_not_reset_env=False): - ValueOptimizationAgent.reset_game(self, do_not_reset_env) + voa.ValueOptimizationAgent.reset_game(self, do_not_reset_env) self.exploration_policy.select_head() def learn_from_batch(self, batch): @@ -51,8 +52,8 @@ class BootstrappedDQNAgent(ValueOptimizationAgent): return total_loss - def act(self, phase=RunPhase.TRAIN): - ValueOptimizationAgent.act(self, phase) + def act(self, phase=utils.RunPhase.TRAIN): + voa.ValueOptimizationAgent.act(self, phase) mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability, self.tp.exploration.architecture_num_q_heads) self.memory.update_last_transition_info({'mask': mask}) diff --git a/agents/categorical_dqn_agent.py b/agents/categorical_dqn_agent.py index dec8ba2..8e442fc 100644 --- a/agents/categorical_dqn_agent.py +++ b/agents/categorical_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf -class CategoricalDQNAgent(ValueOptimizationAgent): +class CategoricalDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms) # prediction's format is (batch,actions,atoms) @@ -57,4 +58,3 @@ class CategoricalDQNAgent(ValueOptimizationAgent): total_loss = result[0] return total_loss - diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py index 88a70b0..07343c8 100644 --- a/agents/clipped_ppo_agent.py +++ b/agents/clipped_ppo_agent.py @@ -13,27 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. 
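A quick numeric check of the categorical support built in categorical_dqn_agent.py above (illustrative; the C51 defaults v_min=-10, v_max=10, atoms=51 are assumed here, they are not taken from this diff):

    import numpy as np

    z_values = np.linspace(-10.0, 10.0, 51)   # np.linspace(v_min, v_max, atoms)
    # 51 atoms spaced (v_max - v_min) / (atoms - 1) = 0.4 apart, endpoints included
    assert z_values.shape == (51,)
    assert np.isclose(z_values[1] - z_values[0], 0.4)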
# - -from agents.actor_critic_agent import * +import collections +import copy from random import shuffle +import numpy as np + +from agents import actor_critic_agent as aca +from agents import policy_optimization_agent as poa +import logger +import utils + # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 -class ClippedPPOAgent(ActorCriticAgent): +class ClippedPPOAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) # signals definition - self.value_loss = Signal('Value Loss') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') + self.policy_loss = utils.Signal('Policy Loss') self.signals.append(self.policy_loss) self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') + self.unclipped_grads = utils.Signal('Grads (unclipped)') self.signals.append(self.unclipped_grads) - self.value_targets = Signal('Value Targets') + self.value_targets = utils.Signal('Value Targets') self.signals.append(self.value_targets) - self.kl_divergence = Signal('KL Divergence') + self.kl_divergence = utils.Signal('KL Divergence') self.signals.append(self.kl_divergence) def fill_advantages(self, batch): @@ -46,9 +53,9 @@ class ClippedPPOAgent(ActorCriticAgent): # calculate advantages advantages = [] value_targets = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps episode_start_idx = 0 advantages = np.array([]) @@ -66,7 +73,7 @@ class ClippedPPOAgent(ActorCriticAgent): advantages = np.append(advantages, rollout_advantages) value_targets = np.append(value_targets, gae_based_value_targets) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") # standardize advantages = (advantages - np.mean(advantages)) / np.std(advantages) @@ -144,8 +151,8 @@ class ClippedPPOAgent(ActorCriticAgent): curr_learning_rate = self.tp.learning_rate # log training parameters - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Surrogate loss", loss['policy_losses'][0]), ("KL divergence", loss['fetch_result'][0]), ("Entropy", loss['fetch_result'][1]), @@ -184,13 +191,13 @@ class ClippedPPOAgent(ActorCriticAgent): self.update_log() # should be done in order to update the data that has been accumulated * while not playing * return np.append(losses[0], losses[1]) - def choose_action(self, current_state, phase=RunPhase.TRAIN): + def choose_action(self, current_state, phase=utils.RunPhase.TRAIN): if self.env.discrete_controls: # DISCRETE _, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state)) action_values = action_values.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -201,7 +208,7 
@@ class ClippedPPOAgent(ActorCriticAgent): _, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state)) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) # if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5: # print action diff --git a/agents/ddpg_agent.py b/agents/ddpg_agent.py index 425f1de..df05395 100644 --- a/agents/ddpg_agent.py +++ b/agents/ddpg_agent.py @@ -13,28 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import copy -from agents.actor_critic_agent import * -from configurations import * +import numpy as np + +from agents import actor_critic_agent as aca +from agents import agent +from architectures import network_wrapper as nw +import configurations as conf +import utils # Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf -class DDPGAgent(ActorCriticAgent): +class DDPGAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) # define critic network self.critic_network = self.main_network # self.networks.append(self.critic_network) # define actor network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.Pi] - self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor', - self.replicated_device, self.worker_device) + tuning_parameters.agent.input_types = {'observation': conf.InputTypes.Observation} + tuning_parameters.agent.output_types = [conf.OutputTypes.Pi] + self.actor_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'actor', + self.replicated_device, self.worker_device) self.networks.append(self.actor_network) - self.q_values = Signal("Q") + self.q_values = utils.Signal("Q") self.signals.append(self.q_values) self.reset_game(do_not_reset_env=True) @@ -82,14 +88,14 @@ class DDPGAgent(ActorCriticAgent): return total_loss def train(self): - return Agent.train(self) + return agent.Agent.train(self) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): assert not self.env.discrete_controls, 'DDPG works only for continuous control problems' result = self.actor_network.online_network.predict(self.tf_input_state(curr_state)) action_values = result[0].squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/ddqn_agent.py b/agents/ddqn_agent.py index 838ae3f..9f19e8a 100644 --- a/agents/ddqn_agent.py +++ b/agents/ddqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Double DQN - https://arxiv.org/abs/1509.06461 -class DDQNAgent(ValueOptimizationAgent): +class DDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def learn_from_batch(self, batch): current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) diff --git a/agents/dfp_agent.py b/agents/dfp_agent.py index c055d2c..f778fff 100644 --- a/agents/dfp_agent.py +++ b/agents/dfp_agent.py @@ -13,17 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.agent import * +from agents import agent +from architectures import network_wrapper as nw +import utils # Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf -class DFPAgent(Agent): +class DFPAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.current_goal = self.tp.agent.goal_vector - self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main', - self.replicated_device, self.worker_device) + self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) def learn_from_batch(self, batch): @@ -45,7 +48,7 @@ class DFPAgent(Agent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # convert to batch so we can run it through the network observation = np.expand_dims(np.array(curr_state['observation']), 0) measurements = np.expand_dims(np.array(curr_state['measurements']), 0) @@ -66,7 +69,7 @@ class DFPAgent(Agent): self.tp.agent.future_measurements_weights) # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) diff --git a/agents/distributional_dqn_agent.py b/agents/distributional_dqn_agent.py index d7c0088..33e7a5e 100644 --- a/agents/distributional_dqn_agent.py +++ b/agents/distributional_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
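For context on the DDQN hunk above, a standalone sketch of the double-DQN target it implements (array names are illustrative, not taken from this diff): the online network picks the next action, the target network evaluates it.

    import numpy as np

    def ddqn_targets(q_online_next, q_target_next, rewards, game_overs, discount):
        # q_*_next: (batch, num_actions); rewards, game_overs: (batch,)
        selected = np.argmax(q_online_next, axis=1)                    # argmax from the online network
        bootstrap = q_target_next[np.arange(len(rewards)), selected]   # evaluated by the target network
        return rewards + (1.0 - game_overs) * discount * bootstrap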
# +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf -class DistributionalDQNAgent(ValueOptimizationAgent): +class DistributionalDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms) # prediction's format is (batch,actions,atoms) @@ -57,4 +58,3 @@ class DistributionalDQNAgent(ValueOptimizationAgent): total_loss = result[0] return total_loss - diff --git a/agents/dqn_agent.py b/agents/dqn_agent.py index 70c0c7d..8660def 100644 --- a/agents/dqn_agent.py +++ b/agents/dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf -class DQNAgent(ValueOptimizationAgent): +class DQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def learn_from_batch(self, batch): current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) diff --git a/agents/human_agent.py b/agents/human_agent.py index c75c2a2..7f8e491 100644 --- a/agents/human_agent.py +++ b/agents/human_agent.py @@ -13,31 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections +import os -from agents.agent import * import pygame +from pandas.io import pickle + +from agents import agent +import logger +import utils -class HumanAgent(Agent): +class HumanAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.clock = pygame.time.Clock() self.max_fps = int(self.tp.visualization.max_fps_for_human_control) - screen.log_title("Human Control Mode") + logger.screen.log_title("Human Control Mode") available_keys = self.env.get_available_keys() if available_keys: - screen.log("Use keyboard keys to move. Press escape to quit. Available keys:") - screen.log("") + logger.screen.log("Use keyboard keys to move. Press escape to quit.
Available keys:") + utils.screen.log("") for action, key in self.env.get_available_keys(): - screen.log("\t- {}: {}".format(action, key)) - screen.separator() + utils.screen.log("\t- {}: {}".format(action, key)) + utils.screen.separator() def train(self): return 0 - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): action = self.env.get_action_from_user() # keep constant fps @@ -49,16 +55,16 @@ class HumanAgent(Agent): return action, {"action_value": 0} def save_replay_buffer_and_exit(self): - replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p') + replay_buffer_path = os.path.join(logger.logger.experiments_path, 'replay_buffer.p') self.memory.tp = None - to_pickle(self.memory, replay_buffer_path) - screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path)) + pickle.to_pickle(self.memory, replay_buffer_path) + utils.screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path)) exit() def log_to_screen(self, phase): - # log to screen - screen.log_dict( - OrderedDict([ + # log to utils.screen + utils.screen.log_dict( + collections.OrderedDict([ ("Episode", self.current_episode), ("total reward", self.total_reward_in_current_episode), ("steps", self.total_steps_counter) diff --git a/agents/imitation_agent.py b/agents/imitation_agent.py index f893fbe..522c569 100644 --- a/agents/imitation_agent.py +++ b/agents/imitation_agent.py @@ -13,23 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections -from agents.agent import * +from agents import agent +from architectures import network_wrapper as nw +import utils +import logging # Imitation Agent -class ImitationAgent(Agent): +class ImitationAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main', - self.replicated_device, self.worker_device) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) self.imitation = True def extract_action_values(self, prediction): return prediction.squeeze() - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # convert to batch so we can run it through the network prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state)) @@ -49,10 +53,10 @@ class ImitationAgent(Agent): def log_to_screen(self, phase): # log to screen - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: # for the training phase - we log during the episode to visualize the progress in training - screen.log_dict( - OrderedDict([ + logging.screen.log_dict( + collections.OrderedDict([ ("Worker", self.task_id), ("Episode", self.current_episode), ("Loss", self.loss.values[-1]), @@ -62,4 +66,4 @@ class ImitationAgent(Agent): ) else: # for the evaluation phase - logging as in regular RL - Agent.log_to_screen(self, phase) + agent.Agent.log_to_screen(self, phase) diff --git a/agents/mmc_agent.py b/agents/mmc_agent.py index 2b5a2cb..4473b06 100644 --- a/agents/mmc_agent.py +++ b/agents/mmc_agent.py @@ -1,5 +1,5 @@ # -# Copyright 
(c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa -class MixedMonteCarloAgent(ValueOptimizationAgent): +class MixedMonteCarloAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate def learn_from_batch(self, batch): diff --git a/agents/n_step_q_agent.py b/agents/n_step_q_agent.py index 5a74fb5..a5b773c 100644 --- a/agents/n_step_q_agent.py +++ b/agents/n_step_q_agent.py @@ -14,22 +14,21 @@ # limitations under the License. # import numpy as np -import scipy.signal -from agents.value_optimization_agent import ValueOptimizationAgent -from agents.policy_optimization_agent import PolicyOptimizationAgent -from logger import logger -from utils import Signal, last_sample +from agents import value_optimization_agent as voa +from agents import policy_optimization_agent as poa +import logger +import utils # N Step Q Learning Agent - https://arxiv.org/abs/1602.01783 -class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): +class NStepQAgent(voa.ValueOptimizationAgent, poa.PolicyOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True) self.last_gradient_update_step_idx = 0 - self.q_values = Signal('Q Values') - self.unclipped_grads = Signal('Grads (unclipped)') - self.value_loss = Signal('Value Loss') + self.q_values = utils.Signal('Q Values') + self.unclipped_grads = utils.Signal('Grads (unclipped)') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.q_values) self.signals.append(self.unclipped_grads) self.signals.append(self.value_loss) @@ -57,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): if game_overs[-1]: R = 0 else: - R = np.max(self.main_network.target_network.predict(last_sample(next_states))) + R = np.max(self.main_network.target_network.predict(utils.last_sample(next_states))) for i in reversed(range(num_transitions)): R = rewards[i] + self.tp.agent.discount * R @@ -85,4 +84,4 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): else: logger.create_signal_value('Update Target Network', 0, overwrite=False) - return PolicyOptimizationAgent.train(self) + return poa.PolicyOptimizationAgent.train(self) diff --git a/agents/naf_agent.py b/agents/naf_agent.py index 65ca83c..35072f7 100644 --- a/agents/naf_agent.py +++ b/agents/naf_agent.py @@ -13,21 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
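A self-contained sketch of the n-step return recursion used in n_step_q_agent.py above (function and argument names are illustrative):

    import numpy as np

    def n_step_returns(rewards, bootstrap_value, discount):
        # R[i] = r[i] + discount * R[i + 1], seeded with the target-network bootstrap
        returns = np.zeros(len(rewards))
        R = bootstrap_value
        for i in reversed(range(len(rewards))):
            R = rewards[i] + discount * R
            returns[i] = R
        return returns

    # n_step_returns([1.0, 0.0, 1.0], bootstrap_value=0.5, discount=0.9) -> [2.1745, 1.305, 1.45]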
# - import numpy as np from agents.value_optimization_agent import ValueOptimizationAgent -from utils import RunPhase, Signal +import utils # Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf class NAFAgent(ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.l_values = Signal("L") - self.a_values = Signal("Advantage") - self.mu_values = Signal("Action") - self.v_values = Signal("V") + self.l_values = utils.Signal("L") + self.a_values = utils.Signal("Advantage") + self.mu_values = utils.Signal("Action") + self.v_values = utils.Signal("V") self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values] def learn_from_batch(self, batch): @@ -49,7 +48,7 @@ class NAFAgent(ValueOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): assert not self.env.discrete_controls, 'NAF works only for continuous control problems' # convert to batch so we can run it through the network @@ -60,7 +59,7 @@ class NAFAgent(ValueOptimizationAgent): outputs=naf_head.mu, squeeze_output=False, ) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/nec_agent.py b/agents/nec_agent.py index 77520a4..47aa33f 100644 --- a/agents/nec_agent.py +++ b/agents/nec_agent.py @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import numpy as np - -from agents.value_optimization_agent import ValueOptimizationAgent +from agents import value_optimization_agent as voa from logger import screen -from utils import RunPhase +import utils # Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf -class NECAgent(ValueOptimizationAgent): +class NECAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=False) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=False) self.current_episode_state_embeddings = [] self.training_started = False @@ -52,7 +49,7 @@ class NECAgent(ValueOptimizationAgent): return total_loss - def act(self, phase=RunPhase.TRAIN): + def act(self, phase=utils.RunPhase.TRAIN): if self.in_heatup: # get embedding in heatup (otherwise we get it through choose_action) embedding = self.main_network.online_network.predict( diff --git a/agents/pal_agent.py b/agents/pal_agent.py index 68ff675..9a11e00 100644 --- a/agents/pal_agent.py +++ b/agents/pal_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf -class PALAgent(ValueOptimizationAgent): +class PALAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.alpha = tuning_parameters.agent.pal_alpha self.persistent = tuning_parameters.agent.persistent_advantage_learning self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate diff --git a/agents/policy_gradients_agent.py b/agents/policy_gradients_agent.py index 3a592d1..037dc88 100644 --- a/agents/policy_gradients_agent.py +++ b/agents/policy_gradients_agent.py @@ -13,25 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -from agents.policy_optimization_agent import * import numpy as np -from logger import * -import tensorflow as tf -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") -from utils import * +from agents import policy_optimization_agent as poa +import logger +import utils -class PolicyGradientsAgent(PolicyOptimizationAgent): +class PolicyGradientsAgent(poa.PolicyOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.returns_mean = Signal('Returns Mean') - self.returns_variance = Signal('Returns Variance') + poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.returns_mean = utils.Signal('Returns Mean') + self.returns_variance = utils.Signal('Returns Variance') self.signals.append(self.returns_mean) self.signals.append(self.returns_variance) self.last_gradient_update_step_idx = 0 @@ -41,21 +34,21 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch) for i in reversed(range(len(total_returns))): - if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.TOTAL_RETURN: total_returns[i] = total_returns[0] - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN: # just take the total return as it is pass - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE: # we can get a single transition episode while playing Doom Basic, causing the std to be 0 if self.std_discounted_return != 0: total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return else: total_returns[i] = 0 - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP: total_returns[i] -= self.mean_return_over_multiple_episodes[i] else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + 
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") targets = total_returns if not self.env.discrete_controls and len(actions.shape) < 2: @@ -69,12 +62,12 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # convert to batch so we can run it through the network if self.env.discrete_controls: # DISCRETE action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -84,7 +77,7 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): # CONTINUOUS result = self.main_network.online_network.predict(self.tf_input_state(curr_state)) action_values = result[0].squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/policy_optimization_agent.py b/agents/policy_optimization_agent.py index be23760..175ca7f 100644 --- a/agents/policy_optimization_agent.py +++ b/agents/policy_optimization_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections -from agents.agent import * -from memories.memory import Episode +import numpy as np + +from agents import agent +from architectures import network_wrapper as nw +import logger +import utils -class PolicyGradientRescaler(Enum): +class PolicyGradientRescaler(utils.Enum): TOTAL_RETURN = 0 FUTURE_RETURN = 1 FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2 @@ -30,11 +35,11 @@ class PolicyGradientRescaler(Enum): GAE = 8 -class PolicyOptimizationAgent(Agent): +class PolicyOptimizationAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler) @@ -44,7 +49,7 @@ class PolicyOptimizationAgent(Agent): self.max_episode_length = 100000 self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length) self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length) - self.entropy = Signal('Entropy') + self.entropy = utils.Signal('Entropy') self.signals.append(self.entropy) self.reset_game(do_not_reset_env=True) @@ -52,8 +57,8 @@ class PolicyOptimizationAgent(Agent): def log_to_screen(self, phase): # log to screen if self.current_episode > 0: - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Worker", self.task_id), ("Episode", 
self.current_episode), ("total reward", self.total_reward_in_current_episode), diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py index 4a37e69..35d1b98 100644 --- a/agents/ppo_agent.py +++ b/agents/ppo_agent.py @@ -13,36 +13,44 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections +import copy -from agents.actor_critic_agent import * -from random import shuffle +import numpy as np + +from agents import actor_critic_agent as aca +from agents import policy_optimization_agent as poa +from architectures import network_wrapper as nw +import configurations +import logger +import utils # Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf -class PPOAgent(ActorCriticAgent): +class PPOAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) self.critic_network = self.main_network # define the policy network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.PPO] + tuning_parameters.agent.input_types = {'observation': configurations.InputTypes.Observation} + tuning_parameters.agent.output_types = [configurations.OutputTypes.PPO] tuning_parameters.agent.optimizer_type = 'Adam' tuning_parameters.agent.l2_regularization = 0 - self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy', - self.replicated_device, self.worker_device) + self.policy_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'policy', + self.replicated_device, self.worker_device) self.networks.append(self.policy_network) # signals definition - self.value_loss = Signal('Value Loss') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') + self.policy_loss = utils.Signal('Policy Loss') self.signals.append(self.policy_loss) - self.kl_divergence = Signal('KL Divergence') + self.kl_divergence = utils.Signal('KL Divergence') self.signals.append(self.kl_divergence) self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') + self.unclipped_grads = utils.Signal('Grads (unclipped)') self.signals.append(self.unclipped_grads) self.reset_game(do_not_reset_env=True) @@ -57,9 +65,9 @@ class PPOAgent(ActorCriticAgent): # calculate advantages advantages = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps episode_start_idx = 0 advantages = np.array([]) @@ -76,7 +84,7 @@ class PPOAgent(ActorCriticAgent): episode_start_idx = idx + 1 advantages = np.append(advantages, rollout_advantages) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") # standardize advantages = (advantages - np.mean(advantages)) / np.std(advantages) @@ -107,7 +115,7 @@ class PPOAgent(ActorCriticAgent): for k, v in 
current_states.items() } total_return_batch = total_return[i * batch_size:(i + 1) * batch_size] - old_policy_values = force_list(self.critic_network.target_network.predict( + old_policy_values = utils.force_list(self.critic_network.target_network.predict( current_states_batch).squeeze()) if self.critic_network.online_network.optimizer_type != 'LBFGS': targets = total_return_batch @@ -155,7 +163,7 @@ class PPOAgent(ActorCriticAgent): actions = np.expand_dims(actions, -1) # get old policy probabilities and distribution - old_policy = force_list(self.policy_network.target_network.predict(current_states)) + old_policy = utils.force_list(self.policy_network.target_network.predict(current_states)) # calculate gradients and apply on both the local policy network and on the global policy network fetches = [self.policy_network.online_network.output_heads[0].kl_divergence, @@ -196,8 +204,8 @@ class PPOAgent(ActorCriticAgent): curr_learning_rate = self.tp.learning_rate # log training parameters - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Surrogate loss", loss['policy_losses'][0]), ("KL divergence", loss['fetch_result'][0]), ("Entropy", loss['fetch_result'][1]), @@ -215,7 +223,7 @@ class PPOAgent(ActorCriticAgent): def update_kl_coefficient(self): # John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow # his implementation for now because we know it works well - screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process)) + logger.screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process)) # update kl coefficient kl_target = self.tp.agent.target_kl_divergence @@ -236,7 +244,7 @@ class PPOAgent(ActorCriticAgent): new_kl_coefficient, self.policy_network.online_network.output_heads[0].kl_coefficient_ph) - screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient)) + logger.screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient)) def post_training_commands(self): if self.tp.agent.use_kl_regularization: @@ -264,12 +272,12 @@ class PPOAgent(ActorCriticAgent): self.update_log() # should be done in order to update the data that has been accumulated * while not playing * return np.append(value_loss, policy_loss) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): if self.env.discrete_controls: # DISCRETE action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -280,7 +288,7 @@ class PPOAgent(ActorCriticAgent): action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state)) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) else: action = action_values_mean diff --git a/agents/qr_dqn_agent.py b/agents/qr_dqn_agent.py index 8888d18..c36b861 100644 --- a/agents/qr_dqn_agent.py +++ b/agents/qr_dqn_agent.py @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
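# Illustrative sketch (not part of the patch): the continuous-control branch of
# PPOAgent.choose_action above samples from a diagonal Gaussian during training and
# falls back to the predicted mean during evaluation. A minimal standalone version
# of that rule, assuming `mean` and `std` are 1-D numpy arrays from the policy head:
import numpy as np

def sample_continuous_action(mean, std, training=True):
    # training: draw a ~ N(mean, diag(std^2)); evaluation: act greedily on the mean
    if training:
        return np.random.randn(*mean.shape) * std + mean
    return mean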
# +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf -class QuantileRegressionDQNAgent(ValueOptimizationAgent): +class QuantileRegressionDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms) # prediction's format is (batch,actions,atoms) diff --git a/agents/value_optimization_agent.py b/agents/value_optimization_agent.py index 75708d7..d91a98e 100644 --- a/agents/value_optimization_agent.py +++ b/agents/value_optimization_agent.py @@ -13,21 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import numpy as np -from agents.agent import Agent -from architectures.network_wrapper import NetworkWrapper -from utils import RunPhase, Signal +from agents import agent +from architectures import network_wrapper as nw +import utils -class ValueOptimizationAgent(Agent): +class ValueOptimizationAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) - self.q_values = Signal("Q") + self.q_values = utils.Signal("Q") self.signals.append(self.q_values) self.reset_game(do_not_reset_env=True) @@ -47,12 +46,12 @@ class ValueOptimizationAgent(Agent): 'require exploration policies which return a single action.' ).format(policy.__class__.__name__)) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): prediction = self.get_prediction(curr_state) actions_q_values = self.get_q_values(prediction) # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: exploration_policy = self.exploration_policy else: exploration_policy = self.evaluation_exploration_policy diff --git a/architectures/__init__.py b/architectures/__init__.py index cbf2ac5..e72fb00 100644 --- a/architectures/__init__.py +++ b/architectures/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
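# Hedged sketch (names are illustrative): with the uniform quantile probabilities set
# up in QuantileRegressionDQNAgent above, per-action Q-values can be recovered from a
# (batch, actions, atoms) quantile prediction by a probability-weighted sum over the
# atoms axis, which with uniform weights is simply the mean over atoms.
import numpy as np

def quantiles_to_q_values(quantile_prediction, quantile_probabilities):
    # quantile_prediction: (batch, actions, atoms); quantile_probabilities: (atoms,)
    return np.dot(quantile_prediction, quantile_probabilities)  # -> (batch, actions)

# e.g. with 4 atoms: quantiles_to_q_values(prediction, np.ones(4) / 4.0)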
# - -from architectures.architecture import * -from logger import failed_imports -try: - from architectures.tensorflow_components.general_network import * - from architectures.tensorflow_components.architecture import * -except ImportError: - failed_imports.append("TensorFlow") +import logger try: - from architectures.neon_components.general_network import * - from architectures.neon_components.architecture import * + from architectures.tensorflow_components import general_network as ts_gn + from architectures.tensorflow_components import architecture as ts_arch except ImportError: - failed_imports.append("Neon") + logger.failed_imports.append("TensorFlow") -from architectures.network_wrapper import * \ No newline at end of file +try: + from architectures.neon_components import general_network as neon_gn + from architectures.neon_components import architecture as neon_arch +except ImportError: + logger.failed_imports.append("Neon") diff --git a/architectures/architecture.py b/architectures/architecture.py index d3175b7..03c48d8 100644 --- a/architectures/architecture.py +++ b/architectures/architecture.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ # limitations under the License. # -from configurations import Preset - class Architecture(object): def __init__(self, tuning_parameters, name=""): @@ -73,4 +71,4 @@ class Architecture(object): pass def set_variable_value(self, assign_op, value, placeholder=None): - pass \ No newline at end of file + pass diff --git a/architectures/neon_components/architecture.py b/architectures/neon_components/architecture.py index de600c1..1577ed8 100644 --- a/architectures/neon_components/architecture.py +++ b/architectures/neon_components/architecture.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import sys -import copy -from ngraph.frontends.neon import * import ngraph as ng -from architectures.architecture import * import numpy as np -from utils import * + +from architectures import architecture +import utils -class NeonArchitecture(Architecture): +class NeonArchitecture(architecture.Architecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - Architecture.__init__(self, tuning_parameters, name) + architecture.Architecture.__init__(self, tuning_parameters, name) assert tuning_parameters.agent.neon_support, 'Neon is not supported for this agent' self.clip_error = tuning_parameters.clip_gradients self.total_loss = None @@ -113,8 +110,8 @@ class NeonArchitecture(Architecture): def accumulate_gradients(self, inputs, targets): # Neon doesn't currently allow separating the grads calculation and grad apply operations # so this feature is not currently available. 
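# Sketch of the optional-backend import pattern that architectures/__init__.py above
# switches to: each framework import is attempted independently and any failure is
# recorded in logger.failed_imports, so coach.py can warn later instead of crashing
# at import time. `optional_backend` is a placeholder module name, not a real dependency.
failed_imports = []

try:
    import optional_backend
except ImportError:
    failed_imports.append("optional_backend")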
instead we do a full training iteration - inputs = force_list(inputs) - targets = force_list(targets) + inputs = utils.force_list(inputs) + targets = utils.force_list(targets) for idx, input in enumerate(inputs): inputs[idx] = input.swapaxes(0, -1) diff --git a/architectures/neon_components/embedders.py b/architectures/neon_components/embedders.py index 5f594a3..9d20a9d 100644 --- a/architectures/neon_components/embedders.py +++ b/architectures/neon_components/embedders.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import ngraph.frontends.neon as neon import ngraph as ng -from ngraph.util.names import name_scope +import ngraph.frontends.neon as neon +import ngraph.util.names as ngraph_names class InputEmbedder(object): @@ -31,7 +30,7 @@ class InputEmbedder(object): self.output = None def __call__(self, prev_input_placeholder=None): - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): # create the input axes axes = [] if len(self.input_size) == 2: diff --git a/architectures/neon_components/general_network.py b/architectures/neon_components/general_network.py index 99ac6e9..f837f71 100644 --- a/architectures/neon_components/general_network.py +++ b/architectures/neon_components/general_network.py @@ -13,15 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import ngraph as ng +from ngraph.frontends import neon +from ngraph.util import names as ngraph_names -from architectures.neon_components.embedders import * -from architectures.neon_components.heads import * -from architectures.neon_components.middleware import * -from architectures.neon_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes +from architectures.neon_components import architecture +from architectures.neon_components import embedders +from architectures.neon_components import middleware +from architectures.neon_components import heads +import configurations as conf -class GeneralNeonNetwork(NeonArchitecture): +class GeneralNeonNetwork(architecture.NeonArchitecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): self.global_network = global_network self.network_is_local = network_is_local @@ -34,7 +37,7 @@ class GeneralNeonNetwork(NeonArchitecture): self.activation_function = self.get_activation_function( tuning_parameters.agent.hidden_layers_activation_function) - NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) + architecture.NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) def get_activation_function(self, activation_function_string): activation_functions = { @@ -53,36 +56,36 @@ class GeneralNeonNetwork(NeonArchitecture): # the observation can be either an image or a vector def get_observation_embedding(with_timestep=False): if self.input_height > 1: - return ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size, - name="observation") + return embedders.ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size, + name="observation") else: - return VectorEmbedder((self.input_depth, 
self.input_width + int(with_timestep)), self.batch_size, - name="observation") + return embedders.VectorEmbedder((self.input_depth, self.input_width + int(with_timestep)), self.batch_size, + name="observation") input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"), - InputTypes.Action: VectorEmbedder((self.num_actions,), self.batch_size, name="action"), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), + conf.InputTypes.Observation: get_observation_embedding(), + conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"), + conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"), + conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), self.batch_size, name="action"), + conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), } return input_mapping[embedder_type] def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach - MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function) + return {conf.MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach + conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function) def get_output_head(self, head_type, head_idx, loss_weight=1.): output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach - OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach - OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach - OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach - OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach + conf.OutputTypes.Q: heads.QHead, + conf.OutputTypes.DuelingQ: heads.DuelingQHead, + conf.OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach + conf.OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach + conf.OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach + conf.OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach + conf.OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach + conf.OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach + conf.OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach } return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) @@ -104,7 +107,7 @@ class GeneralNeonNetwork(NeonArchitecture): done_creating_input_placeholders = False for network_idx in range(self.num_networks): - with name_scope('network_{}'.format(network_idx)): + with ngraph_names.name_scope('network_{}'.format(network_idx)): #################### # Input Embeddings # #################### diff --git 
a/architectures/neon_components/heads.py b/architectures/neon_components/heads.py index df49867..21af758 100644 --- a/architectures/neon_components/heads.py +++ b/architectures/neon_components/heads.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import ngraph as ng -from ngraph.util.names import name_scope -import ngraph.frontends.neon as neon -import numpy as np -from utils import force_list -from architectures.neon_components.losses import * +from ngraph.frontends import neon +from ngraph.util import names as ngraph_names + +import utils +from architectures.neon_components import losses class Head(object): @@ -30,7 +29,7 @@ class Head(object): self.loss = [] self.loss_type = [] self.regularizations = [] - self.loss_weight = force_list(loss_weight) + self.loss_weight = utils.force_list(loss_weight) self.weights_init = neon.GlorotInit() self.biases_init = neon.ConstantInit() self.target = [] @@ -44,15 +43,15 @@ class Head(object): :param input_layer: the input to the graph :return: the output of the last layer and the target placeholder """ - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): self._build_module(input_layer) - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) + self.output = utils.force_list(self.output) + self.target = utils.force_list(self.target) + self.input = utils.force_list(self.input) + self.loss_type = utils.force_list(self.loss_type) + self.loss = utils.force_list(self.loss) + self.regularizations = utils.force_list(self.regularizations) if self.is_local: self.set_loss() @@ -106,7 +105,7 @@ class QHead(Head): if tuning_parameters.agent.replace_mse_with_huber_loss: raise Exception("huber loss is not supported in neon") else: - self.loss_type = mean_squared_error + self.loss_type = losses.mean_squared_error def _build_module(self, input_layer): # Standard Q Network @@ -159,7 +158,7 @@ class MeasurementsPredictionHead(Head): if tuning_parameters.agent.replace_mse_with_huber_loss: raise Exception("huber loss is not supported in neon") else: - self.loss_type = mean_squared_error + self.loss_type = losses.mean_squared_error def _build_module(self, input_layer): # This is almost exactly the same as Dueling Network but we predict the future measurements for each action @@ -167,7 +166,7 @@ class MeasurementsPredictionHead(Head): multistep_measurements_size = self.measurements_size[0] * self.num_predicted_steps_ahead # actions expectation tower (expectation stream) - E - with name_scope("expectation_stream"): + with ngraph_names.name_scope("expectation_stream"): expectation_stream = neon.Sequential([ neon.Affine(nout=256, activation=neon.Rectlin(), weight_init=self.weights_init, bias_init=self.biases_init), @@ -176,7 +175,7 @@ class MeasurementsPredictionHead(Head): ])(input_layer) # action fine differences tower (action stream) - A - with name_scope("action_stream"): + with ngraph_names.name_scope("action_stream"): action_stream_unnormalized = neon.Sequential([ neon.Affine(nout=256, activation=neon.Rectlin(), 
weight_init=self.weights_init, bias_init=self.biases_init), @@ -191,4 +190,3 @@ class MeasurementsPredictionHead(Head): # merge to future measurements predictions self.output = repeated_expectation_stream + action_stream - diff --git a/architectures/neon_components/losses.py b/architectures/neon_components/losses.py index 26e8644..a6fc064 100644 --- a/architectures/neon_components/losses.py +++ b/architectures/neon_components/losses.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import ngraph as ng -import ngraph.frontends.neon as neon -from ngraph.util.names import name_scope -import numpy as np +from ngraph.util import names as ngraph_names def mean_squared_error(targets, outputs, weights=1.0, scope=""): - with name_scope(scope): + with ngraph_names.name_scope(scope): # TODO: reduce mean over the action axis loss = ng.squared_L2(targets - outputs) weighted_loss = loss * weights diff --git a/architectures/neon_components/middleware.py b/architectures/neon_components/middleware.py index 2aa02fd..fad7b9c 100644 --- a/architectures/neon_components/middleware.py +++ b/architectures/neon_components/middleware.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import ngraph as ng import ngraph.frontends.neon as neon -from ngraph.util.names import name_scope -import numpy as np +from ngraph.util import names as ngraph_names class MiddlewareEmbedder(object): @@ -30,7 +27,7 @@ class MiddlewareEmbedder(object): self.activation_function = activation_function def __call__(self, input_layer): - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): self.input = input_layer self._build_module() diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py index ef026e6..d21bd77 100644 --- a/architectures/network_wrapper.py +++ b/architectures/network_wrapper.py @@ -13,20 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
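# Hedged sketch of the `utils.force_list` helper the heads above depend on; the real
# implementation lives in utils.py and is not part of this diff. The assumption is
# that it wraps a single value in a list and leaves lists untouched, so Head.__call__
# can iterate over outputs, targets, losses and regularizations uniformly.
def force_list(values):
    if isinstance(values, list):
        return values
    return [values]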
# +import os +import collections -from collections import OrderedDict -from configurations import Preset, Frameworks -from logger import * +import configurations as conf +import logger try: import tensorflow as tf - from architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork + from architectures.tensorflow_components import general_network as tf_net #import GeneralTensorFlowNetwork except ImportError: - failed_imports.append("TensorFlow") + logger.failed_imports.append("TensorFlow") try: - from architectures.neon_components.general_network import GeneralNeonNetwork + from architectures.neon_components import general_network as neon_net except ImportError: - failed_imports.append("Neon") + logger.failed_imports.append("Neon") class NetworkWrapper(object): @@ -50,12 +51,12 @@ class NetworkWrapper(object): self.name = name self.sess = tuning_parameters.sess - if self.tp.framework == Frameworks.TensorFlow: - general_network = GeneralTensorFlowNetwork - elif self.tp.framework == Frameworks.Neon: - general_network = GeneralNeonNetwork + if self.tp.framework == conf.Frameworks.TensorFlow: + general_network = tf_net.GeneralTensorFlowNetwork + elif self.tp.framework == conf.Frameworks.Neon: + general_network = neon_net.GeneralNeonNetwork else: - raise Exception("{} Framework is not supported".format(Frameworks().to_string(self.tp.framework))) + raise Exception("{} Framework is not supported".format(conf.Frameworks().to_string(self.tp.framework))) # Global network - the main network shared between threads self.global_network = None @@ -77,13 +78,13 @@ class NetworkWrapper(object): self.target_network = general_network(tuning_parameters, '{}/target'.format(name), network_is_local=True) - if not self.tp.distributed and self.tp.framework == Frameworks.TensorFlow: + if not self.tp.distributed and self.tp.framework == conf.Frameworks.TensorFlow: variables_to_restore = tf.global_variables() variables_to_restore = [v for v in variables_to_restore if '/online' in v.name] self.model_saver = tf.train.Saver(variables_to_restore) if self.tp.sess and self.tp.checkpoint_restore_dir: checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir) - screen.log_title("Loading checkpoint: {}".format(checkpoint)) + logger.screen.log_title("Loading checkpoint: {}".format(checkpoint)) self.model_saver.restore(self.tp.sess, checkpoint) self.update_target_network() @@ -178,8 +179,8 @@ class NetworkWrapper(object): def save_model(self, model_id): saved_model_path = self.model_saver.save(self.tp.sess, os.path.join(self.tp.save_model_dir, str(model_id) + '.ckpt')) - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Saving model", saved_model_path), ]), prefix="Checkpoint" diff --git a/architectures/tensorflow_components/architecture.py b/architectures/tensorflow_components/architecture.py index 3474ff4..41396b1 100644 --- a/architectures/tensorflow_components/architecture.py +++ b/architectures/tensorflow_components/architecture.py @@ -15,12 +15,11 @@ # import time -import numpy as np import tensorflow as tf -from architectures.architecture import Architecture -from utils import force_list, squeeze_list -from configurations import Preset, MiddlewareTypes +from architectures import architecture +import configurations as conf +import utils def variable_summaries(var): """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" @@ -37,14 +36,14 @@ def variable_summaries(var): tf.summary.scalar('min', tf.reduce_min(var)) 
tf.summary.histogram('histogram', var) -class TensorFlowArchitecture(Architecture): +class TensorFlowArchitecture(architecture.Architecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): """ :param tuning_parameters: The parameters used for running the algorithm :type tuning_parameters: Preset :param name: The name of the network """ - Architecture.__init__(self, tuning_parameters, name) + architecture.Architecture.__init__(self, tuning_parameters, name) self.middleware_embedder = None self.network_is_local = network_is_local assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent' @@ -174,7 +173,7 @@ class TensorFlowArchitecture(Architecture): feed_dict = self._feed_dict(inputs) # feed targets - targets = force_list(targets) + targets = utils.force_list(targets) for placeholder_idx, target in enumerate(targets): feed_dict[self.targets[placeholder_idx]] = target @@ -186,13 +185,13 @@ class TensorFlowArchitecture(Architecture): else: fetches.append(self.tensor_gradients) fetches += [self.total_loss, self.losses] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: fetches.append(self.middleware_embedder.state_out) additional_fetches_start_idx = len(fetches) fetches += additional_fetches # feed the lstm state if necessary - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: # we can't always assume that we are starting from scratch here can we? feed_dict[self.middleware_embedder.c_in] = self.middleware_embedder.c_init feed_dict[self.middleware_embedder.h_in] = self.middleware_embedder.h_init @@ -206,7 +205,7 @@ class TensorFlowArchitecture(Architecture): # extract the fetches norm_unclipped_grads, grads, total_loss, losses = result[:4] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4] fetched_tensors = [] if len(additional_fetches) > 0: @@ -308,7 +307,7 @@ class TensorFlowArchitecture(Architecture): if outputs is None: outputs = self.outputs - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: feed_dict[self.middleware_embedder.c_in] = self.curr_rnn_c_in feed_dict[self.middleware_embedder.h_in] = self.curr_rnn_h_in @@ -317,7 +316,7 @@ class TensorFlowArchitecture(Architecture): output = self.tp.sess.run(outputs, feed_dict) if squeeze_output: - output = squeeze_list(output) + output = utils.squeeze_list(output) return output diff --git a/architectures/tensorflow_components/embedders.py b/architectures/tensorflow_components/embedders.py index 880de2f..b0a4ff1 100644 --- a/architectures/tensorflow_components/embedders.py +++ b/architectures/tensorflow_components/embedders.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
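# Illustrative sketch (attribute names are placeholders) of the recurrent-state
# bookkeeping that TensorFlowArchitecture above performs for the LSTM middleware:
# the state returned by one session run is fed back on the next call and reset to
# the initial state at the start of a new rollout.
class RecurrentStateCarrier(object):
    def __init__(self, c_init, h_init):
        self.c_init, self.h_init = c_init, h_init
        self.reset()

    def reset(self):
        self.curr_c, self.curr_h = self.c_init, self.h_init

    def fill_feed_dict(self, feed_dict, c_in_placeholder, h_in_placeholder):
        feed_dict[c_in_placeholder] = self.curr_c
        feed_dict[h_in_placeholder] = self.curr_h
        return feed_dict

    def update(self, state_out):
        # state_out is the (c, h) pair fetched from the LSTM cell after the run
        self.curr_c, self.curr_h = state_out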
# - import tensorflow as tf + from configurations import EmbedderComplexity diff --git a/architectures/tensorflow_components/general_network.py b/architectures/tensorflow_components/general_network.py index 03bb2a9..b9b4a78 100644 --- a/architectures/tensorflow_components/general_network.py +++ b/architectures/tensorflow_components/general_network.py @@ -13,15 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import tensorflow as tf -from architectures.tensorflow_components.embedders import * -from architectures.tensorflow_components.heads import * -from architectures.tensorflow_components.middleware import * -from architectures.tensorflow_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes +from architectures.tensorflow_components import architecture +from architectures.tensorflow_components import embedders +from architectures.tensorflow_components import middleware +from architectures.tensorflow_components import heads +import configurations as conf -class GeneralTensorFlowNetwork(TensorFlowArchitecture): +class GeneralTensorFlowNetwork(architecture.TensorFlowArchitecture): """ A generalized version of all possible networks implemented using tensorflow. """ @@ -37,7 +38,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): self.activation_function = self.get_activation_function( tuning_parameters.agent.hidden_layers_activation_function) - TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) + architecture.TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) def get_activation_function(self, activation_function_string): activation_functions = { @@ -56,37 +57,37 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): # the observation can be either an image or a vector def get_observation_embedding(with_timestep=False): if self.input_height > 1: - return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", - input_rescaler=self.tp.agent.input_rescaler) + return embedders.ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", + input_rescaler=self.tp.agent.input_rescaler) else: - return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation") + return embedders.VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation") input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements"), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector"), - InputTypes.Action: VectorEmbedder((self.num_actions,), name="action"), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), + conf.InputTypes.Observation: get_observation_embedding(), + conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, name="measurements"), + conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, name="goal_vector"), + conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), name="action"), + conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), } return input_mapping[embedder_type] def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: LSTM_Embedder, - MiddlewareTypes.FC: 
FC_Embedder}.get(middleware_type)(self.activation_function) + return {conf.MiddlewareTypes.LSTM: middleware.LSTM_Embedder, + conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function) def get_output_head(self, head_type, head_idx, loss_weight=1.): output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: VHead, - OutputTypes.Pi: PolicyHead, - OutputTypes.MeasurementsPrediction: MeasurementsPredictionHead, - OutputTypes.DNDQ: DNDQHead, - OutputTypes.NAF: NAFHead, - OutputTypes.PPO: PPOHead, - OutputTypes.PPO_V: PPOVHead, - OutputTypes.CategoricalQ: CategoricalQHead, - OutputTypes.QuantileRegressionQ: QuantileRegressionQHead + conf.OutputTypes.Q: heads.QHead, + conf.OutputTypes.DuelingQ: heads.DuelingQHead, + conf.OutputTypes.V: heads.VHead, + conf.OutputTypes.Pi: heads.PolicyHead, + conf.OutputTypes.MeasurementsPrediction: heads.MeasurementsPredictionHead, + conf.OutputTypes.DNDQ: heads.DNDQHead, + conf.OutputTypes.NAF: heads.NAFHead, + conf.OutputTypes.PPO: heads.PPOHead, + conf.OutputTypes.PPO_V: heads.PPOVHead, + conf.OutputTypes.CategoricalQ: heads.CategoricalQHead, + conf.OutputTypes.QuantileRegressionQ: heads.QuantileRegressionQHead } return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py index 616ab13..fdf1919 100644 --- a/architectures/tensorflow_components/heads.py +++ b/architectures/tensorflow_components/heads.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import tensorflow as tf import numpy as np -from utils import force_list + +import utils # Used to initialize weights for policy and value output layers @@ -36,7 +36,7 @@ class Head(object): self.loss = [] self.loss_type = [] self.regularizations = [] - self.loss_weight = force_list(loss_weight) + self.loss_weight = utils.force_list(loss_weight) self.target = [] self.input = [] self.is_local = is_local @@ -50,12 +50,12 @@ class Head(object): with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()): self._build_module(input_layer) - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) + self.output = utils.force_list(self.output) + self.target = utils.force_list(self.target) + self.input = utils.force_list(self.input) + self.loss_type = utils.force_list(self.loss_type) + self.loss = utils.force_list(self.loss) + self.regularizations = utils.force_list(self.regularizations) if self.is_local: self.set_loss() self._post_build() diff --git a/architectures/tensorflow_components/middleware.py b/architectures/tensorflow_components/middleware.py index dfe1597..3ef2631 100644 --- a/architectures/tensorflow_components/middleware.py +++ b/architectures/tensorflow_components/middleware.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
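# Minimal sketch of the dictionary-based factory dispatch used by
# GeneralTensorFlowNetwork.get_output_head above: an OutputTypes value selects a head
# class, and unsupported combinations are mapped to None. The function name and error
# message here are illustrative, not the Coach API.
def build_output_head(head_type, output_mapping, tuning_parameters, head_idx,
                      loss_weight=1.0, is_local=True):
    head_class = output_mapping[head_type]
    if head_class is None:
        raise NotImplementedError("{} is not supported for this backend".format(head_type))
    return head_class(tuning_parameters, head_idx, loss_weight, is_local)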
# - import tensorflow as tf import numpy as np diff --git a/architectures/tensorflow_components/shared_variables.py b/architectures/tensorflow_components/shared_variables.py index 2775251..23d179e 100644 --- a/architectures/tensorflow_components/shared_variables.py +++ b/architectures/tensorflow_components/shared_variables.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import tensorflow as tf import numpy as np @@ -79,4 +78,4 @@ class SharedRunningStats(object): @property def shape(self): - return self._shape \ No newline at end of file + return self._shape diff --git a/coach.py b/coach.py index 8ba8cf3..73f40dc 100644 --- a/coach.py +++ b/coach.py @@ -13,46 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import sys, inspect, re -import os -import json -import presets -from presets import * -from utils import set_gpu, list_all_classes_in_module -from architectures import * -from environments import * -from agents import * -from utils import * -from logger import screen, logger -import argparse -from subprocess import Popen -import datetime -import presets import atexit -import sys +import json +import os +import re import subprocess -from threading import Thread +import sys +import time -if len(set(failed_imports)) > 0: - screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports)))) +import agents +import argparse +import configurations as conf +import environments +import logger +import presets +import utils + + +if len(set(logger.failed_imports)) > 0: + logger.screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(logger.failed_imports)))) def set_framework(framework_type): # choosing neural network framework - framework = Frameworks().get(framework_type) + framework = conf.Frameworks().get(framework_type) sess = None - if framework == Frameworks.TensorFlow: + if framework == conf.Frameworks.TensorFlow: import tensorflow as tf config = tf.ConfigProto() config.allow_soft_placement = True config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.Session(config=config) - elif framework == Frameworks.Neon: + elif framework == conf.Frameworks.Neon: import ngraph as ng sess = ng.transformers.make_transformer() - screen.log_title("Using {} framework".format(Frameworks().to_string(framework))) + logger.screen.log_title("Using {} framework".format(conf.Frameworks().to_string(framework))) return sess @@ -66,8 +62,8 @@ def check_input_and_fill_run_dict(parser): # list available presets if args.list: - presets_lists = list_all_classes_in_module(presets) - screen.log_title("Available Presets:") + presets_lists = utils.list_all_classes_in_module(presets) + logger.screen.log_title("Available Presets:") for preset in presets_lists: print(preset) sys.exit(0) @@ -77,28 +73,28 @@ def check_input_and_fill_run_dict(parser): # num_workers = int(args.num_workers) num_workers = int(re.match("^\d+$", args.num_workers).group(0)) except ValueError: - screen.error("Parameter num_workers should be an integer.") + logger.screen.error("Parameter num_workers should be an integer.") - preset_names = 
list_all_classes_in_module(presets) + preset_names = utils.list_all_classes_in_module(presets) if args.preset is not None and args.preset not in preset_names: - screen.error("A non-existing preset was selected. ") + logger.screen.error("A non-existing preset was selected. ") if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir): - screen.error("The requested checkpoint folder to load from does not exist. ") + logger.screen.error("The requested checkpoint folder to load from does not exist. ") if args.save_model_sec is not None: try: args.save_model_sec = int(args.save_model_sec) except ValueError: - screen.error("Parameter save_model_sec should be an integer.") + logger.screen.error("Parameter save_model_sec should be an integer.") if args.preset is None and (args.agent_type is None or args.environment_type is None or args.exploration_policy_type is None) and not args.play: - screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,' + logger.screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,' ' environment_type and exploration_policy_type to assemble a preset. ' '\nAt least one of these parameters was not given.') elif args.preset is None and args.play and args.environment_type is None: - screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,' + logger.screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,' ' the user is expected to input the desired environment_type and level.' '\nAt least one of these parameters was not given.') elif args.preset is None and args.play and args.environment_type: @@ -106,11 +102,11 @@ def check_input_and_fill_run_dict(parser): args.exploration_policy_type = 'ExplorationParameters' # get experiment name and path - experiment_name = logger.get_experiment_name(args.experiment_name) - experiment_path = logger.get_experiment_path(experiment_name) + experiment_name = logger.logger.get_experiment_name(args.experiment_name) + experiment_path = logger.logger.get_experiment_path(experiment_name) if args.play and num_workers > 1: - screen.warning("Playing the game as a human is only available with a single worker. " + logger.screen.warning("Playing the game as a human is only available with a single worker. 
" "The number of workers will be reduced to 1") num_workers = 1 @@ -123,7 +119,7 @@ def check_input_and_fill_run_dict(parser): run_dict['preset'] = args.preset run_dict['custom_parameter'] = args.custom_parameter run_dict['experiment_path'] = experiment_path - run_dict['framework'] = Frameworks().get(args.framework) + run_dict['framework'] = conf.Frameworks().get(args.framework) run_dict['play'] = args.play run_dict['evaluate'] = args.evaluate# or args.play @@ -251,16 +247,16 @@ if __name__ == "__main__": os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # dump documentation - logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) + logger.logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) if not args.no_summary: - atexit.register(logger.summarize_experiment) - screen.change_terminal_title(logger.experiment_name) + atexit.register(logger.logger.summarize_experiment) + logger.screen.change_terminal_title(logger.logger.experiment_name) # Single-threaded runs if run_dict['num_threads'] == 1: # set tuning parameters json_run_dict_path = run_dict_to_json(run_dict) - tuning_parameters = json_to_preset(json_run_dict_path) + tuning_parameters = presets.json_to_preset(json_run_dict_path) tuning_parameters.sess = set_framework(args.framework) if args.print_parameters: @@ -268,8 +264,9 @@ if __name__ == "__main__": # Single-thread runs tuning_parameters.task_index = 0 - env_instance = create_environment(tuning_parameters) - agent = eval(tuning_parameters.agent.type + '(env_instance, tuning_parameters)') + env_instance = environments.create_environment(tuning_parameters) + agent = eval('agents.' + tuning_parameters.agent.type + + '(env_instance, tuning_parameters)') # Start the training or evaluation if tuning_parameters.evaluate: @@ -282,11 +279,11 @@ if __name__ == "__main__": assert args.framework.lower() == 'tensorflow', "Distributed training works only with TensorFlow" os.environ["OMP_NUM_THREADS"]="1" # set parameter server and workers addresses - ps_hosts = "localhost:{}".format(get_open_port()) - worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(run_dict['num_threads'] + 1)]) + ps_hosts = "localhost:{}".format(utils.get_open_port()) + worker_hosts = ",".join(["localhost:{}".format(utils.get_open_port()) for i in range(run_dict['num_threads'] + 1)]) # Make sure to disable GPU so that all the workers will use the CPU - set_cpu() + utils.set_cpu() # create a parameter server cmd = [ @@ -296,9 +293,9 @@ if __name__ == "__main__": "--worker_hosts={}".format(worker_hosts), "--job_name=ps", ] - parameter_server = Popen(cmd) + parameter_server = subprocess.Popen(cmd) - screen.log_title("*** Distributed Training ***") + logger.screen.log_title("*** Distributed Training ***") time.sleep(1) # create N training workers and 1 evaluating worker @@ -321,7 +318,7 @@ if __name__ == "__main__": "--job_name=worker", "--load_json={}".format(json_run_dict_path)] - p = Popen(workers_args) + p = subprocess.Popen(workers_args) if i != run_dict['num_threads']: workers.append(p) diff --git a/configurations.py b/configurations.py index 5e553d8..f2054a3 100644 --- a/configurations.py +++ b/configurations.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from utils import Enum import json + import types +import utils -class Frameworks(Enum): +class Frameworks(utils.Enum): TensorFlow = 1 Neon = 2 diff --git a/dashboard.py b/dashboard.py index f21b13b..b2aaa0f 100644 --- a/dashboard.py +++ b/dashboard.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,29 +19,24 @@ To run Coach Dashboard, run the following command: python3 dashboard.py """ -from utils import * -import os -import datetime - -import sys -import wx -import random -import pandas as pd -from pandas.io.common import EmptyDataError -import numpy as np import colorsys -from bokeh.palettes import Dark2 -from bokeh.layouts import row, column, widgetbox, Spacer -from bokeh.models import ColumnDataSource, Range1d, LinearAxis, HoverTool, WheelZoomTool, PanTool, Legend -from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup -from bokeh.models.glyphs import Patch -from bokeh.plotting import figure, show, curdoc -from utils import force_list -from utils import squeeze_list -from itertools import cycle -from os import listdir -from os.path import isfile, join, isdir, basename -from enum import Enum +import datetime +import enum +import itertools +import os +import random + +from bokeh import palettes +from bokeh import layouts as bl +from bokeh import models as bm +from bokeh.models import widgets as bw +from bokeh import plotting as bp +import numpy as np +import pandas as pd +from pandas.io import pandas_common +import wx + +import utils class DialogApp(wx.App): @@ -67,7 +62,7 @@ class Signal: self.name = name self.full_name = "{}/{}".format(parent.filename, self.name) self.selected = False - self.color = random.choice(Dark2[8]) + self.color = random.choice(palettes.Dark2[8]) self.line = None self.bands = None self.bokeh_source = parent.bokeh_source @@ -79,12 +74,12 @@ class Signal: if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name: self.sub_signals.append(name) if len(self.sub_signals) > 1: - self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]]) - self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]]) - self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]]) - self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]]) + self.mean_signal = utils.squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]]) + self.stdev_signal = utils.squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]]) + self.min_signal = utils.squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]]) + self.max_signal = utils.squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]]) else: - self.mean_signal = squeeze_list(self.name) + self.mean_signal = utils.squeeze_list(self.name) self.stdev_signal = None self.min_signal = None self.max_signal = None @@ -107,16 +102,16 @@ class Signal: if self.selected != val: self.selected = val if self.line: - # self.set_color(Dark2[8][current_color]) - # current_color = (current_color + 1) % len(Dark2[8]) + # self.set_color(palettes.Dark2[8][current_color]) + # current_color = (current_color + 1) % 
len(palettes.Dark2[8]) self.line.visible = self.selected if self.bands: self.bands.visible = self.selected and self.show_bollinger_bands elif self.selected: # lazy plotting - plot only when selected for the first time show_spinner() - self.set_color(Dark2[8][current_color]) - current_color = (current_color + 1) % len(Dark2[8]) + self.set_color(palettes.Dark2[8][current_color]) + current_color = (current_color + 1) % len(palettes.Dark2[8]) if self.has_bollinger_bands: self.set_bands_source() self.create_bands() @@ -149,7 +144,7 @@ class Signal: if self.bollinger_bands_source: self.bollinger_bands_source.data = source_data else: - self.bollinger_bands_source = ColumnDataSource(source_data) + self.bollinger_bands_source = bm.ColumnDataSource(source_data) def change_bollinger_bands_state(self, new_state): self.show_bollinger_bands = new_state @@ -192,11 +187,11 @@ class SignalsFileBase: def update_source_and_signals(self): # create bokeh data sources - self.bokeh_source_orig = ColumnDataSource(self.csv) + self.bokeh_source_orig = bm.ColumnDataSource(self.csv) self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis] if self.bokeh_source is None: - self.bokeh_source = ColumnDataSource(self.csv) + self.bokeh_source = bm.ColumnDataSource(self.csv) else: # self.bokeh_source.data = self.bokeh_source_orig.data # smooth the data if necessary @@ -282,7 +277,7 @@ class SignalsFile(SignalsFileBase): def __init__(self, csv_path, load=True): SignalsFileBase.__init__(self) self.full_csv_path = csv_path - self.dir, self.filename, _ = break_file_path(csv_path) + self.dir, self.filename, _ = utils.break_file_path(csv_path) if load: self.load() # this helps set the correct x axis @@ -296,7 +291,7 @@ class SignalsFile(SignalsFileBase): try: self.csv = pd.read_csv(self.full_csv_path) break - except EmptyDataError: + except pandas_common.EmptyDataError: self.csv = None continue self.csv = self.csv.interpolate() @@ -327,7 +322,7 @@ class SignalsFilesGroup(SignalsFileBase): else: # get the common directory for all the experiments self.dir = os.path.dirname(os.path.commonprefix(csv_paths)) - self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files)) + self.filename = '{} - Group({})'.format(os.path.basename(self.dir), len(self.signals_files)) self.load() # this helps set the correct x axis @@ -425,7 +420,7 @@ class SignalsFilesGroup(SignalsFileBase): pass -class RunType(Enum): +class RunType(enum.Enum): SINGLE_FOLDER_SINGLE_FILE = 1 SINGLE_FOLDER_MULTIPLE_FILES = 2 MULTIPLE_FOLDERS_SINGLE_FILES = 3 @@ -433,7 +428,7 @@ class RunType(Enum): UNKNOWN = 0 -class FolderType(Enum): +class FolderType(enum.Enum): SINGLE_FILE = 1 MULTIPLE_FILES = 2 MULTIPLE_FOLDERS = 3 @@ -454,24 +449,24 @@ root_dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(root_dir, 'spinner.css'), 'r') as f: spinner_style = """""".format(f.read()) spinner_html = """""" -spinner = Div(text="""""") +spinner = bw.Div(text="""""") # file refresh time placeholder -refresh_info = Div(text="""""", width=210) +refresh_info = bw.Div(text="""""", width=210) # create figures -plot = figure(plot_width=1200, plot_height=800, - tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save', - toolbar_location='above', x_axis_label='Episodes', - x_range=Range1d(0, 10000), y_range=Range1d(0, 100000)) -plot.extra_y_ranges = {"secondary": Range1d(start=-100, end=200)} -plot.add_layout(LinearAxis(y_range_name="secondary"), 'right') +plot = bp.figure(plot_width=1200, plot_height=800, + 
tools='pan,box_zoom,wheel_zoom,crosshair,undo,redo,reset,save', + toolbar_location='above', x_axis_label='Episodes', + x_range=bm.Range1d(0, 10000), y_range=bm.Range1d(0, 100000)) +plot.extra_y_ranges = {"secondary": bm.Range1d(start=-100, end=200)} +plot.add_layout(bm.LinearAxis(y_range_name="secondary"), 'right') # legend -div = Div(text="""""") -legend = widgetbox([div]) +div = bw.Div(text="""""") +legend = bl.widgetbox([div]) -bokeh_legend = Legend( +bokeh_legend = bm.Legend( items=[("12345678901234567890123456789012345678901234567890", [])], # 50 letters # items=[(" ", [])], # 50 letters location=(-20, 0), orientation="vertical", @@ -605,8 +600,8 @@ def load_files_group(): # classify the folder as containing a single file, multiple files or only folders def classify_folder(dir_path): - files = [f for f in listdir(dir_path) if isfile(join(dir_path, f)) and f.endswith('.csv')] - folders = [d for d in listdir(dir_path) if isdir(join(dir_path, d))] + files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.csv')] + folders = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))] if len(files) == 1: return FolderType.SINGLE_FILE elif len(files) > 1: @@ -628,7 +623,7 @@ def get_run_type(dir_path): elif folder_type == FolderType.MULTIPLE_FOLDERS: # folder contains sub dirs -> we assume we can classify the folder using only the first sub dir - sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))] + sub_dirs = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))] # checking only the first folder in the root dir for its type, since we assume that all sub dirs will share the # same structure (i.e. if one is a result of multi-threaded run, so will all the other). 
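# Minimal sketch of the secondary-axis pattern the dashboard above uses, assuming a
# Bokeh version that still accepts plot_width/plot_height: an extra named y-range is
# registered on the figure and individual glyphs opt into it via y_range_name.
from bokeh import models as bm
from bokeh import plotting as bp

fig = bp.figure(plot_width=600, plot_height=400)
fig.extra_y_ranges = {"secondary": bm.Range1d(start=-100, end=200)}
fig.add_layout(bm.LinearAxis(y_range_name="secondary"), 'right')
fig.line([0, 1, 2], [10, 20, 15])                             # default y-range
fig.line([0, 1, 2], [-50, 0, 150], y_range_name="secondary")  # secondary y-range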
@@ -645,12 +640,12 @@ def add_directory_csv_files(dir_path, paths=None): if not paths: paths = [] - for p in listdir(dir_path): - path = join(dir_path, p) - if isdir(path): + for p in os.listdir(dir_path): + path = os.path.join(dir_path, p) + if os.path.isdir(path): # call recursively for each dir paths = add_directory_csv_files(path, paths) - elif isfile(path) and path.endswith('.csv'): + elif os.path.isfile(path) and path.endswith('.csv'): # add every file to the list paths.append(path) @@ -667,7 +662,7 @@ def handle_dir(dir_path, run_type): elif run_type == RunType.MULTIPLE_FOLDERS_SINGLE_FILES: create_files_group_signal(paths) elif run_type == RunType.MULTIPLE_FOLDERS_MULTIPLE_FILES: - sub_dirs = [d for d in listdir(dir_path) if isdir(join(dir_path, d))] + sub_dirs = [d for d in os.listdir(dir_path) if os.path.isdir(os.path.join(dir_path, d))] # for d in sub_dirs: # paths = add_directory_csv_files(os.path.join(dir_path, d)) # create_files_group_signal(paths) @@ -731,7 +726,7 @@ def unload_file(): selected_file.hide_all_signals() del signals_files[selected_file.filename] data_selector.options = [""] - filenames = cycle(files_selector.options) + filenames = itertools.cycle(files_selector.options) files_selector.options.remove(selected_file.filename) if len(files_selector.options) > 0: files_selector.value = next(filenames) @@ -869,48 +864,48 @@ crcolor, crRGBs = generate_color_range(color_resolution, brightness) # produce # ---------------- Build Website Layout ------------------- # select file -file_selection_button = Button(label="Select Files", button_type="success", width=120) +file_selection_button = bw.Button(label="Select Files", button_type="success", width=120) file_selection_button.on_click(load_files_group) -files_selector_spacer = Spacer(width=10) +files_selector_spacer = bl.Spacer(width=10) -group_selection_button = Button(label="Select Directory", button_type="primary", width=140) +group_selection_button = bw.Button(label="Select Directory", button_type="primary", width=140) group_selection_button.on_click(load_directory_group) -unload_file_button = Button(label="Unload", button_type="danger", width=50) +unload_file_button = bw.Button(label="Unload", button_type="danger", width=50) unload_file_button.on_click(unload_file) # files selection box -files_selector = Select(title="Files:", options=[], width=200) +files_selector = bw.Select(title="Files:", options=[], width=200) files_selector.on_change('value', change_data_selector) # data selection box -data_selector = MultiSelect(title="Data:", options=[], size=12) +data_selector = bw.MultiSelect(title="Data:", options=[], size=12) data_selector.on_change('value', select_data) # x axis selection box -x_axis_selector_title = Div(text="""X Axis:""") -x_axis_selector = RadioButtonGroup(labels=x_axis_options, active=0) +x_axis_selector_title = bw.Div(text="""X Axis:""") +x_axis_selector = bw.RadioButtonGroup(labels=x_axis_options, active=0) x_axis_selector.on_click(change_x_axis) -# toggle second axis button -toggle_second_axis_button = Button(label="Toggle Second Axis", button_type="success") +# toggle second axis button +toggle_second_axis_button = bw.Button(label="Toggle Second Axis", button_type="success") toggle_second_axis_button.on_click(toggle_second_axis) # averaging slider -averaging_slider = Slider(title="Averaging window", start=1, end=101, step=10) +averaging_slider = bw.Slider(title="Averaging window", start=1, end=101, step=10) averaging_slider.on_change('value', update_averaging) # group properties checkbox
-group_cb = CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[]) +group_cb = bw.CheckboxGroup(labels=["Show statistics bands", "Ungroup signals"], active=[]) group_cb.on_click(toggle_group_property) # color selector -color_selector_title = Div(text="""Select Color:""") -crsource = ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs)) -color_selector = figure(x_range=(0, color_resolution), y_range=(0, 10), - plot_width=300, plot_height=40, - tools='tap') +color_selector_title = bw.Div(text="""Select Color:""") +crsource = bm.ColumnDataSource(data=dict(x=crx, y=cry, crcolor=crcolor, RGBs=crRGBs)) +color_selector = bp.figure(x_range=(0, color_resolution), y_range=(0, 10), + plot_width=300, plot_height=40, + tools='tap') color_selector.axis.visible = False color_range = color_selector.rect(x='x', y='y', width=1, height=10, color='crcolor', source=crsource) @@ -920,43 +915,43 @@ color_selector.toolbar.logo = None color_selector.toolbar_location = None # title -title = Div(text="""

Coach Dashboard

""") +title = bw.Div(text="""

Coach Dashboard

""") # landing page -landing_page_description = Div(text="""

Start by selecting an experiment file or directory to open:

""") -center = Div(text="""""") -center_buttons = Div(text="""""", width=0) -landing_page = column(center, +landing_page_description = bw.Div(text="""

Start by selecting an experiment file or directory to open:

""") +center = bw.Div(text="""""") +center_buttons = bw.Div(text="""""", width=0) +landing_page = bl.column(center, title, landing_page_description, - row(center_buttons), - row(file_selection_button, sizing_mode='scale_width'), - row(group_selection_button, sizing_mode='scale_width'), + bl.row(center_buttons), + bl.row(file_selection_button, sizing_mode='scale_width'), + bl.row(group_selection_button, sizing_mode='scale_width'), sizing_mode='scale_width') # main layout of the document -layout = row(file_selection_button, files_selector_spacer, group_selection_button, width=300) -layout = column(layout, files_selector) -layout = column(layout, row(refresh_info, unload_file_button)) -layout = column(layout, data_selector) -layout = column(layout, color_selector_title) -layout = column(layout, color_selector) -layout = column(layout, x_axis_selector_title) -layout = column(layout, x_axis_selector) -layout = column(layout, group_cb) -layout = column(layout, toggle_second_axis_button) -layout = column(layout, averaging_slider) -# layout = column(layout, legend) -layout = row(layout, plot) -layout = column(title, layout) -layout = column(layout, spinner) +layout = bl.row(file_selection_button, files_selector_spacer, group_selection_button, width=300) +layout = bl.column(layout, files_selector) +layout = bl.column(layout, bl.row(refresh_info, unload_file_button)) +layout = bl.column(layout, data_selector) +layout = bl.column(layout, color_selector_title) +layout = bl.column(layout, color_selector) +layout = bl.column(layout, x_axis_selector_title) +layout = bl.column(layout, x_axis_selector) +layout = bl.column(layout, group_cb) +layout = bl.column(layout, toggle_second_axis_button) +layout = bl.column(layout, averaging_slider) +# layout = bl.column(layout, legend) +layout = bl.row(layout, plot) +layout = bl.column(title, layout) +layout = bl.column(layout, spinner) -doc = curdoc() +doc = bp.curdoc() doc.add_root(landing_page) doc.add_periodic_callback(reload_all_files, 20000) -plot.y_range = Range1d(0, 100) -plot.extra_y_ranges['secondary'] = Range1d(0, 100) +plot.y_range = bm.Range1d(0, 100) +plot.extra_y_ranges['secondary'] = bm.Range1d(0, 100) # show load file dialog immediately on start #doc.add_timeout_callback(load_files, 1000) diff --git a/debug_utils.py b/debug_utils.py index 11db9fc..9ee79a1 100644 --- a/debug_utils.py +++ b/debug_utils.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import matplotlib.pyplot as plt import numpy as np diff --git a/docs/docs/mdx_math.py b/docs/docs/mdx_math.py index fe28d11..b59158e 100644 --- a/docs/docs/mdx_math.py +++ b/docs/docs/mdx_math.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,9 +24,9 @@ Adds support for displaying math formulas using [MathJax](http://www.mathjax.org Author: 2015, Dmitry Shachnev . 
''' - import markdown + class MathExtension(markdown.extensions.Extension): def __init__(self, *args, **kwargs): self.config = { diff --git a/docs/docs/setup.py b/docs/docs/setup.py index bfac4df..a45ed50 100644 --- a/docs/docs/setup.py +++ b/docs/docs/setup.py @@ -1,5 +1,6 @@ +#!/usr/bin/env python3 # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -#!/usr/bin/env python3 - from distutils.core import setup + long_description = \ """This extension adds math formulas support to Python-Markdown_ (works with version 2.6 or newer). diff --git a/docs/fix_index.py b/docs/fix_index.py index 45a79c4..87e8373 100644 --- a/docs/fix_index.py +++ b/docs/fix_index.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import fnmatch +import os + -import os, fnmatch, sys def findReplace(directory, find, replace, filePattern): for path, dirs, files in os.walk(os.path.abspath(directory)): for filename in fnmatch.filter(files, filePattern): @@ -25,7 +27,8 @@ def findReplace(directory, find, replace, filePattern): with open(filepath, "w") as f: f.write(s) -if __name__=="__main__": + +if __name__ == "__main__": findReplace('./site/', '/"', '/index.html"', "*.html") findReplace('./site/', '"/index.html"', '"./index.html"', "*.html") findReplace('./site/', '"."', '"./index.html"', "*.html") @@ -34,4 +37,4 @@ if __name__=="__main__": findReplace('./site/', '/#', '/index.html#', "search_index.json") findReplace('./site/assets/javascripts/', 'search_index.json', 'search_index.txt', "*.js") findReplace('./site/mkdocs/js/', 'search_index.json', 'search_index.txt', "search.js") - os.rename("./site/mkdocs/search_index.json", "./site/mkdocs/search_index.txt") \ No newline at end of file + os.rename("./site/mkdocs/search_index.json", "./site/mkdocs/search_index.txt") diff --git a/environments/__init__.py b/environments/__init__.py index 1dd8d1d..0b03f0a 100644 --- a/environments/__init__.py +++ b/environments/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from logger import * -from utils import Enum, get_open_port -from environments.gym_environment_wrapper import * -from environments.doom_environment_wrapper import * -from environments.carla_environment_wrapper import * +from environments.gym_environment_wrapper import GymEnvironmentWrapper +from environments.doom_environment_wrapper import DoomEnvironmentWrapper +from environments.carla_environment_wrapper import CarlaEnvironmentWrapper +import utils -class EnvTypes(Enum): +class EnvTypes(utils.Enum): Doom = "DoomEnvironmentWrapper" Gym = "GymEnvironmentWrapper" Carla = "CarlaEnvironmentWrapper" @@ -31,6 +29,3 @@ def create_environment(tuning_parameters): env_type_name, env_type = EnvTypes().verify(tuning_parameters.env.type) env = eval(env_type)(tuning_parameters) return env - - - diff --git a/environments/carla_environment_wrapper.py b/environments/carla_environment_wrapper.py index b4657a3..60c4b38 100644 --- a/environments/carla_environment_wrapper.py +++ b/environments/carla_environment_wrapper.py @@ -1,34 +1,31 @@ +import logging +import os +import signal +import subprocess import sys -from os import path, environ - -try: - if 'CARLA_ROOT' in environ: - sys.path.append(path.join(environ.get('CARLA_ROOT'), 'PythonClient')) - from carla.client import CarlaClient - from carla.settings import CarlaSettings - from carla.tcp import TCPConnectionError - from carla.sensor import Camera - from carla.client import VehicleControl -except ImportError: - from logger import failed_imports - failed_imports.append("CARLA") import numpy as np -import time -import logging -import subprocess -import signal -from environments.environment_wrapper import EnvironmentWrapper -from utils import * -from logger import screen, logger -from PIL import Image + +import logger +try: + if 'CARLA_ROOT' in os.environ: + sys.path.append(os.path.join(os.environ.get('CARLA_ROOT'), + 'PythonClient')) + from carla import client as carla_client + from carla import settings as carla_settings + from carla import sensor as carla_sensor +except ImportError: + logger.failed_imports.append("CARLA") +from environments import environment_wrapper as ew +import utils # enum of the available levels and their path -class CarlaLevel(Enum): +class CarlaLevel(utils.Enum): TOWN1 = "/Game/Maps/Town01" TOWN2 = "/Game/Maps/Town02" + key_map = { 'BRAKE': (274,), # down arrow 'GAS': (273,), # up arrow @@ -41,16 +38,16 @@ key_map = { } -class CarlaEnvironmentWrapper(EnvironmentWrapper): +class CarlaEnvironmentWrapper(ew.EnvironmentWrapper): def __init__(self, tuning_parameters): - EnvironmentWrapper.__init__(self, tuning_parameters) + ew.EnvironmentWrapper.__init__(self, tuning_parameters) self.tp = tuning_parameters # server configuration self.server_height = self.tp.env.server_height self.server_width = self.tp.env.server_width - self.port = get_open_port() + self.port = utils.get_open_port() self.host = 'localhost' self.map = CarlaLevel().get(self.tp.env.level) @@ -70,7 +67,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper): self.settings = fp.read() else: # hard coded settings - self.settings = CarlaSettings() + self.settings = carla_settings.CarlaSettings() self.settings.set( SynchronousMode=True, SendNonPlayerAgentsInfo=False, @@ -80,7 +77,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper): self.settings.randomize_seeds() # add cameras - camera = Camera('CameraRGB') + camera = carla_sensor.Camera('CameraRGB') camera.set_image_size(self.width, self.height) camera.set_position(200, 0, 140) camera.set_rotation(0, 0, 0) @@ -92,7 
+89,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper): logging.disable(40) # open the client - self.game = CarlaClient(self.host, self.port, timeout=99999999) + self.game = carla_client.CarlaClient(self.host, self.port, timeout=99999999) self.game.connect() scene = self.game.load_settings(self.settings) @@ -141,12 +138,12 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper): self.renderer.create_screen(image.shape[1], image.shape[0]) def _open_server(self): - log_path = path.join(logger.experiments_path, "CARLA_LOG_{}.txt".format(self.port)) + log_path = os.path.join(logger.logger.experiments_path, "CARLA_LOG_{}.txt".format(self.port)) with open(log_path, "wb") as out: - cmd = [path.join(environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map, - "-benchmark", "-carla-server", "-fps=10", "-world-port={}".format(self.port), - "-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height), - "-carla-no-hud"] + cmd = [os.path.join(os.environ.get('CARLA_ROOT'), 'CarlaUE4.sh'), self.map, + "-benchmark", "-carla-server", "-fps=10", "-world-port={}".format(self.port), + "-windowed -ResX={} -ResY={}".format(self.server_width, self.server_height), + "-carla-no-hud"] if self.config: cmd.append("-carla-settings={}".format(self.config)) p = subprocess.Popen(cmd, stdout=out, stderr=out) @@ -201,7 +198,7 @@ class CarlaEnvironmentWrapper(EnvironmentWrapper): action = action_idx self.last_action_idx = action - self.control = VehicleControl() + self.control = carla_client.VehicleControl() self.control.throttle = np.clip(action[0], 0, 1) self.control.steer = np.clip(action[1], -1, 1) self.control.brake = np.abs(np.clip(action[0], -1, 0)) diff --git a/environments/doom_environment_wrapper.py b/environments/doom_environment_wrapper.py index a0c618d..716a6d0 100644 --- a/environments/doom_environment_wrapper.py +++ b/environments/doom_environment_wrapper.py @@ -13,23 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import enum +import os +import numpy as np +import logger try: import vizdoom except ImportError: - from logger import failed_imports - failed_imports.append("ViZDoom") + logger.failed_imports.append("ViZDoom") -import numpy as np -from environments.environment_wrapper import EnvironmentWrapper -from os import path, environ -from utils import * -from logger import * +from environments import environment_wrapper as ew +import utils # enum of the available levels and their path -class DoomLevel(Enum): +class DoomLevel(utils.Enum): BASIC = "basic.cfg" DEFEND = "defend_the_center.cfg" DEATHMATCH = "deathmatch.cfg" @@ -40,6 +40,7 @@ class DoomLevel(Enum): DEFEND_THE_LINE = "defend_the_line.cfg" DEADLY_CORRIDOR = "deadly_corridor.cfg" + key_map = { 'NO-OP': 96, # ` 'ATTACK': 13, # enter @@ -78,15 +79,16 @@ key_map = { } -class DoomEnvironmentWrapper(EnvironmentWrapper): +class DoomEnvironmentWrapper(ew.EnvironmentWrapper): def __init__(self, tuning_parameters): - EnvironmentWrapper.__init__(self, tuning_parameters) + ew.EnvironmentWrapper.__init__(self, tuning_parameters) # load the emulator with the required level self.level = DoomLevel().get(self.tp.env.level) - self.scenarios_dir = path.join(environ.get('VIZDOOM_ROOT'), 'scenarios') + self.scenarios_dir = os.path.join(os.environ.get('VIZDOOM_ROOT'), + 'scenarios') self.game = vizdoom.DoomGame() - self.game.load_config(path.join(self.scenarios_dir, self.level)) + self.game.load_config(os.path.join(self.scenarios_dir, self.level)) self.game.set_window_visible(False) self.game.add_game_args("+vid_forcesurface 1") diff --git a/environments/environment_wrapper.py b/environments/environment_wrapper.py index d077c39..640ac91 100644 --- a/environments/environment_wrapper.py +++ b/environments/environment_wrapper.py @@ -13,14 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import numpy as np -from utils import * -from configurations import Preset -from renderer import Renderer import operator import time +import numpy as np + +import renderer +import utils + class EnvironmentWrapper(object): def __init__(self, tuning_parameters): @@ -50,7 +50,7 @@ class EnvironmentWrapper(object): self.height = 1 self.is_state_type_image = True self.measurements_size = 0 - self.phase = RunPhase.TRAIN + self.phase = utils.RunPhase.TRAIN self.tp = tuning_parameters self.record_video_every = self.tp.visualization.record_video_every self.env_id = self.tp.env.level @@ -62,7 +62,7 @@ class EnvironmentWrapper(object): self.wait_for_explicit_human_action = False self.is_rendered = self.is_rendered or self.human_control self.game_is_open = True - self.renderer = Renderer() + self.renderer = renderer.Renderer() @property def measurements(self): diff --git a/environments/gym_environment_wrapper.py b/environments/gym_environment_wrapper.py index c821bf8..0ec22d8 100644 --- a/environments/gym_environment_wrapper.py +++ b/environments/gym_environment_wrapper.py @@ -13,40 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import random -import sys -from logger import * import gym import numpy as np -import time -import random -try: - import roboschool - from OpenGL import GL -except ImportError: - from logger import failed_imports - failed_imports.append("RoboSchool") -try: - from gym_extensions.continuous import mujoco -except: - from logger import failed_imports - failed_imports.append("GymExtensions") - -try: - import pybullet_envs -except ImportError: - from logger import failed_imports - failed_imports.append("PyBullet") - -from gym import wrappers -from utils import force_list, RunPhase -from environments.environment_wrapper import EnvironmentWrapper +from environments import environment_wrapper as ew +import utils -class GymEnvironmentWrapper(EnvironmentWrapper): +class GymEnvironmentWrapper(ew.EnvironmentWrapper): def __init__(self, tuning_parameters): - EnvironmentWrapper.__init__(self, tuning_parameters) + ew.EnvironmentWrapper.__init__(self, tuning_parameters) # env parameters if ':' in self.env_id: @@ -124,7 +102,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper): def _update_state(self): if hasattr(self.env, 'env') and hasattr(self.env.env, 'ale'): - if self.phase == RunPhase.TRAIN and hasattr(self, 'current_ale_lives'): + if self.phase == utils.RunPhase.TRAIN and hasattr(self, 'current_ale_lives'): # signal termination for life loss if self.current_ale_lives != self.env.env.ale.lives(): self.done = True @@ -146,7 +124,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper): if type(action_idx) == int and action_idx == 0: # deal with the "reset" action 0 action = [0] * self.env.action_space.shape[0] - action = np.array(force_list(action)) + action = np.array(utils.force_list(action)) # removing redundant dimensions such that the action size will match the expected action size from gym if action.shape != self.env.action_space.shape: action = np.squeeze(action) diff --git a/exploration_policies/__init__.py b/exploration_policies/__init__.py index 260d958..101794a 100644 --- a/exploration_policies/__init__.py +++ b/exploration_policies/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,16 +13,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from exploration_policies.additive_noise import AdditiveNoise +from exploration_policies.approximated_thompson_sampling_using_dropout import ApproximatedThompsonSamplingUsingDropout +from exploration_policies.bayesian import Bayesian +from exploration_policies.boltzmann import Boltzmann +from exploration_policies.bootstrapped import Bootstrapped +from exploration_policies.categorical import Categorical +from exploration_policies.continuous_entropy import ContinuousEntropy +from exploration_policies.e_greedy import EGreedy +from exploration_policies.exploration_policy import ExplorationPolicy +from exploration_policies.greedy import Greedy +from exploration_policies.ou_process import OUProcess +from exploration_policies.thompson_sampling import ThompsonSampling -from exploration_policies.additive_noise import * -from exploration_policies.approximated_thompson_sampling_using_dropout import * -from exploration_policies.bayesian import * -from exploration_policies.boltzmann import * -from exploration_policies.bootstrapped import * -from exploration_policies.categorical import * -from exploration_policies.continuous_entropy import * -from exploration_policies.e_greedy import * -from exploration_policies.exploration_policy import * -from exploration_policies.greedy import * -from exploration_policies.ou_process import * -from exploration_policies.thompson_sampling import * + +__all__ = [AdditiveNoise, + ApproximatedThompsonSamplingUsingDropout, + Bayesian, + Boltzmann, + Bootstrapped, + Categorical, + ContinuousEntropy, + EGreedy, + ExplorationPolicy, + Greedy, + OUProcess, + ThompsonSampling] diff --git a/exploration_policies/additive_noise.py b/exploration_policies/additive_noise.py index d8cd7c9..124daa3 100644 --- a/exploration_policies/additive_noise.py +++ b/exploration_policies/additive_noise.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,18 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
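A note on the new __all__ blocks introduced by this refactor (here in exploration_policies/__init__.py and likewise in the other package __init__ files): __all__ conventionally holds the exported names as strings, and listing the class objects themselves will typically make a wildcard import such as "from exploration_policies import *" raise a TypeError. A minimal sketch of the string-based form, assuming the same public surface as above:

# Sketch only: string-based __all__ for exploration_policies/__init__.py,
# listing the same exports by name rather than by object.
__all__ = [
    'AdditiveNoise',
    'ApproximatedThompsonSamplingUsingDropout',
    'Bayesian',
    'Boltzmann',
    'Bootstrapped',
    'Categorical',
    'ContinuousEntropy',
    'EGreedy',
    'ExplorationPolicy',
    'Greedy',
    'OUProcess',
    'ThompsonSampling',
]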
# - import numpy as np -from exploration_policies.exploration_policy import * + +from exploration_policies import exploration_policy +import utils -class AdditiveNoise(ExplorationPolicy): +class AdditiveNoise(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.variance = tuning_parameters.exploration.initial_noise_variance_percentage self.final_variance = tuning_parameters.exploration.final_noise_variance_percentage self.decay_steps = tuning_parameters.exploration.noise_variance_decay_steps @@ -37,7 +38,7 @@ class AdditiveNoise(ExplorationPolicy): self.variance = self.final_variance def get_action(self, action_values): - if self.phase == RunPhase.TRAIN: + if self.phase == utils.RunPhase.TRAIN: self.decay_exploration() action = np.random.normal(action_values, 2 * self.variance * self.action_abs_range) return action #np.clip(action, -self.action_abs_range, self.action_abs_range).squeeze() diff --git a/exploration_policies/approximated_thompson_sampling_using_dropout.py b/exploration_policies/approximated_thompson_sampling_using_dropout.py index 2a51ae2..13be0d6 100644 --- a/exploration_policies/approximated_thompson_sampling_using_dropout.py +++ b/exploration_policies/approximated_thompson_sampling_using_dropout.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy -class ApproximatedThompsonSamplingUsingDropout(ExplorationPolicy): +class ApproximatedThompsonSamplingUsingDropout(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.dropout_discard_probability = tuning_parameters.exploration.dropout_discard_probability self.network = tuning_parameters.network self.assign_op = self.network.dropout_discard_probability.assign(self.dropout_discard_probability) diff --git a/exploration_policies/bayesian.py b/exploration_policies/bayesian.py index f5ab6b3..75f394d 100644 --- a/exploration_policies/bayesian.py +++ b/exploration_policies/bayesian.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,18 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import numpy as np -from exploration_policies.exploration_policy import * -import tensorflow as tf +from exploration_policies import exploration_policy +import utils -class Bayesian(ExplorationPolicy): +class Bayesian(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.keep_probability = tuning_parameters.exploration.initial_keep_probability self.final_keep_probability = tuning_parameters.exploration.final_keep_probability self.keep_probability_decay_delta = ( @@ -40,7 +41,7 @@ class Bayesian(ExplorationPolicy): self.keep_probability -= self.keep_probability_decay_delta def get_action(self, action_values): - if self.phase == RunPhase.TRAIN: + if self.phase == utils.RunPhase.TRAIN: self.decay_keep_probability() # dropout = self.network.get_layer('variable_dropout_1') # with tf.Session() as sess: diff --git a/exploration_policies/boltzmann.py b/exploration_policies/boltzmann.py index de954be..c8bc351 100644 --- a/exploration_policies/boltzmann.py +++ b/exploration_policies/boltzmann.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy +import utils - -class Boltzmann(ExplorationPolicy): +class Boltzmann(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.temperature = tuning_parameters.exploration.initial_temperature self.final_temperature = tuning_parameters.exploration.final_temperature self.temperature_decay_delta = ( @@ -35,7 +36,7 @@ class Boltzmann(ExplorationPolicy): self.temperature -= self.temperature_decay_delta def get_action(self, action_values): - if self.phase == RunPhase.TRAIN: + if self.phase == utils.RunPhase.TRAIN: self.decay_temperature() # softmax calculation exp_probabilities = np.exp(action_values / self.temperature) diff --git a/exploration_policies/bootstrapped.py b/exploration_policies/bootstrapped.py index 14b6d2c..5d17e75 100644 --- a/exploration_policies/bootstrapped.py +++ b/exploration_policies/bootstrapped.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
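One thing worth flagging in the Boltzmann hunk above: np.exp(action_values / self.temperature) can overflow for large action values or small temperatures. A common, numerically safer variant (a sketch, not part of this diff) shifts by the maximum before exponentiating, which leaves the resulting distribution unchanged:

import numpy as np

def boltzmann_probabilities(action_values, temperature):
    # Softmax with a temperature; subtracting the max before exponentiating
    # only improves numerical stability, since softmax is shift-invariant.
    scaled = np.asarray(action_values, dtype=np.float64) / temperature
    scaled -= scaled.max()
    exp_values = np.exp(scaled)
    return exp_values / exp_values.sum()

# Example: boltzmann_probabilities([1.0, 2.0, 3.0], temperature=0.5)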
# +import numpy as np -from exploration_policies.e_greedy import * +from exploration_policies import e_greedy -class Bootstrapped(EGreedy): +class Bootstrapped(e_greedy.EGreedy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running parameters :type tuning_parameters: Preset """ - EGreedy.__init__(self, tuning_parameters) + e_greedy.EGreedy.__init__(self, tuning_parameters) self.num_heads = tuning_parameters.exploration.architecture_num_q_heads self.selected_head = 0 @@ -31,7 +32,7 @@ class Bootstrapped(EGreedy): self.selected_head = np.random.randint(self.num_heads) def get_action(self, action_values): - return EGreedy.get_action(self, action_values[self.selected_head]) + return e_greedy.EGreedy.get_action(self, action_values[self.selected_head]) def get_control_param(self): return self.selected_head diff --git a/exploration_policies/categorical.py b/exploration_policies/categorical.py index 1be5509..cf8e0a5 100644 --- a/exploration_policies/categorical.py +++ b/exploration_policies/categorical.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy -class Categorical(ExplorationPolicy): +class Categorical(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) def get_action(self, action_values): # choose actions according to the probabilities diff --git a/exploration_policies/continuous_entropy.py b/exploration_policies/continuous_entropy.py index 8d7ee66..edfc3b5 100644 --- a/exploration_policies/continuous_entropy.py +++ b/exploration_policies/continuous_entropy.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy -class ContinuousEntropy(ExplorationPolicy): +class ContinuousEntropy(exploration_policy.ExplorationPolicy): pass diff --git a/exploration_policies/e_greedy.py b/exploration_policies/e_greedy.py index f0cb01f..9aa2dd7 100644 --- a/exploration_policies/e_greedy.py +++ b/exploration_policies/e_greedy.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
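The body of Categorical.get_action is not visible in the hunk above (only its comment about choosing actions according to the probabilities). For readers unfamiliar with the pattern, a categorical draw over a discrete action space usually reduces to something like the following sketch; it assumes the input is already a normalized probability vector and is not taken from this codebase:

import numpy as np

def sample_categorical(action_probabilities):
    # Draw one discrete action index, with probability proportional to the
    # given (already normalized) probability vector.
    action_probabilities = np.asarray(action_probabilities).flatten()
    return np.random.choice(len(action_probabilities), p=action_probabilities)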
# +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy +import utils -class EGreedy(ExplorationPolicy): +class EGreedy(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.epsilon = tuning_parameters.exploration.initial_epsilon self.final_epsilon = tuning_parameters.exploration.final_epsilon self.epsilon_decay_delta = ( @@ -52,9 +54,9 @@ class EGreedy(ExplorationPolicy): self.variance = self.final_variance def get_action(self, action_values): - if self.phase == RunPhase.TRAIN: + if self.phase == utils.RunPhase.TRAIN: self.decay_exploration() - epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon + epsilon = self.evaluation_epsilon if self.phase == utils.RunPhase.TEST else self.epsilon if self.discrete_controls: top_action = np.argmax(action_values) @@ -67,4 +69,4 @@ class EGreedy(ExplorationPolicy): return np.squeeze(action_values + (np.random.rand() < epsilon) * noise) def get_control_param(self): - return self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon + return self.evaluation_epsilon if self.phase == utils.RunPhase.TEST else self.epsilon diff --git a/exploration_policies/exploration_policy.py b/exploration_policies/exploration_policy.py index d211054..6a9d9a7 100644 --- a/exploration_policies/exploration_policy.py +++ b/exploration_policies/exploration_policy.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import numpy as np -from utils import * -from configurations import * +import utils class ExplorationPolicy(object): @@ -25,7 +22,7 @@ class ExplorationPolicy(object): :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - self.phase = RunPhase.HEATUP + self.phase = utils.RunPhase.HEATUP self.action_space_size = tuning_parameters.env.action_space_size self.action_abs_range = tuning_parameters.env_instance.action_space_abs_range self.discrete_controls = tuning_parameters.env_instance.discrete_controls @@ -39,7 +36,7 @@ class ExplorationPolicy(object): def get_action(self, action_values): """ - Given a list of values corresponding to each action, + Given a list of values corresponding to each action, choose one actions according to the exploration policy :param action_values: A list of action values :return: The chosen action @@ -55,4 +52,4 @@ class ExplorationPolicy(object): self.phase = phase def get_control_param(self): - return 0 \ No newline at end of file + return 0 diff --git a/exploration_policies/greedy.py b/exploration_policies/greedy.py index 34acf1b..1306731 100644 --- a/exploration_policies/greedy.py +++ b/exploration_policies/greedy.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy -class Greedy(ExplorationPolicy): +class Greedy(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) def get_action(self, action_values): return np.argmax(action_values) diff --git a/exploration_policies/ou_process.py b/exploration_policies/ou_process.py index c7d5851..fcc8a9f 100644 --- a/exploration_policies/ou_process.py +++ b/exploration_policies/ou_process.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,21 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import numpy as np -from exploration_policies.exploration_policy import * + +from exploration_policies import exploration_policy # Based on on the description in: # https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab # Ornstein-Uhlenbeck process -class OUProcess(ExplorationPolicy): +class OUProcess(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.action_space_size = tuning_parameters.env.action_space_size self.mu = float(tuning_parameters.exploration.mu) * np.ones(self.action_space_size) self.theta = tuning_parameters.exploration.theta diff --git a/exploration_policies/thompson_sampling.py b/exploration_policies/thompson_sampling.py index 265ce60..8324d87 100644 --- a/exploration_policies/thompson_sampling.py +++ b/exploration_policies/thompson_sampling.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from exploration_policies.exploration_policy import * +from exploration_policies import exploration_policy -class ThompsonSampling(ExplorationPolicy): +class ThompsonSampling(exploration_policy.ExplorationPolicy): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - ExplorationPolicy.__init__(self, tuning_parameters) + exploration_policy.ExplorationPolicy.__init__(self, tuning_parameters) self.action_space_size = tuning_parameters.env.action_space_size def get_action(self, action_values): diff --git a/logger.py b/logger.py index c070cc0..4fca581 100644 --- a/logger.py +++ b/logger.py @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
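Since the OUProcess hunk above only touches imports and the constructor, here is the discretized Ornstein-Uhlenbeck step the class refers to (per the math.stackexchange description linked in its comment): x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1). A standalone sketch with illustrative parameter names (sigma and dt are assumptions; only mu and theta appear in the hunk):

import numpy as np

def ou_step(state, mu, theta, sigma, dt=1.0):
    # One Euler-Maruyama step of an Ornstein-Uhlenbeck process: mean-reverting
    # drift towards mu plus Gaussian diffusion noise.
    drift = theta * (mu - state) * dt
    diffusion = sigma * np.sqrt(dt) * np.random.normal(size=state.shape)
    return state + drift + diffusion

# Example: ou_step(np.zeros(3), mu=0.0, theta=0.15, sigma=0.2)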
# - -from pandas import * +import datetime import os import re -from pprint import pprint -import threading -from subprocess import Popen, PIPE -import time -import datetime -from six.moves import input -from PIL import Image -from typing import Union import shutil +import time +import typing + +import pandas +import PIL.Image +from six.moves import input global failed_imports failed_imports = [] @@ -90,7 +87,7 @@ class ScreenLogger(object): def ask_input(self, title): return input("{}{}{}".format(Colors.BG_CYAN, title, Colors.END)) - def ask_yes_no(self, title: str, default: Union[None, bool]=None): + def ask_yes_no(self, title: str, default: typing.Union[None, bool]=None): """ Ask the user for a yes / no question and return True if the answer is yes and False otherwise. The function will keep asking the user for an answer until he answers one of the possible responses. @@ -156,7 +153,7 @@ class BaseLogger(object): class Logger(BaseLogger): def __init__(self): BaseLogger.__init__(self) - self.data = DataFrame() + self.data = pandas.DataFrame() self.csv_path = '' self.doc_path = '' self.aggregated_data_across_threads = None @@ -249,7 +246,7 @@ class Logger(BaseLogger): if not os.path.exists(output_dir): os.makedirs(output_dir) output_path = os.path.join(output_dir, output_file) - pil_images = [Image.fromarray(image) for image in images] + pil_images = [PIL.Image.fromarray(image) for image in images] pil_images[0].save(output_path, save_all=True, append_images=pil_images[1:], duration=1.0 / fps, loop=0) def remove_experiment_dir(self): diff --git a/memories/__init__.py b/memories/__init__.py index 29a0894..0468453 100644 --- a/memories/__init__.py +++ b/memories/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from memories.differentiable_neural_dictionary import AnnoyDictionary +from memories.differentiable_neural_dictionary import AnnoyIndex +from memories.differentiable_neural_dictionary import QDND +from memories.episodic_experience_replay import EpisodicExperienceReplay +from memories.memory import Episode +from memories.memory import Memory +from memories.memory import Transition -from memories.differentiable_neural_dictionary import * -from memories.episodic_experience_replay import * -from memories.memory import * +__all__ = [AnnoyDictionary, + AnnoyIndex, + Episode, + EpisodicExperienceReplay, + Memory, + QDND, + Transition] diff --git a/memories/differentiable_neural_dictionary.py b/memories/differentiable_neural_dictionary.py index 1a1fdc7..d5f0246 100644 --- a/memories/differentiable_neural_dictionary.py +++ b/memories/differentiable_neural_dictionary.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import os +import pickle import numpy as np from annoy import AnnoyIndex -import os, pickle class AnnoyDictionary(object): diff --git a/memories/episodic_experience_replay.py b/memories/episodic_experience_replay.py index 5930d78..936b084 100644 --- a/memories/episodic_experience_replay.py +++ b/memories/episodic_experience_replay.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,24 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import typing -from memories.memory import * -import threading -from typing import Union +import numpy as np + +from memories import memory -class EpisodicExperienceReplay(Memory): +class EpisodicExperienceReplay(memory.Memory): def __init__(self, tuning_parameters): """ :param tuning_parameters: A Preset class instance with all the running paramaters :type tuning_parameters: Preset """ - Memory.__init__(self, tuning_parameters) + memory.Memory.__init__(self, tuning_parameters) self.tp = tuning_parameters self.max_size_in_episodes = tuning_parameters.agent.num_episodes_in_experience_replay self.max_size_in_transitions = tuning_parameters.agent.num_transitions_in_experience_replay self.discount = tuning_parameters.agent.discount - self.buffer = [Episode()] # list of episodes + self.buffer = [memory.Episode()] # list of episodes self.transitions = [] self._length = 1 self._num_transitions = 0 @@ -96,7 +97,7 @@ class EpisodicExperienceReplay(Memory): def store(self, transition): if len(self.buffer) == 0: - self.buffer.append(Episode()) + self.buffer.append(memory.Episode()) last_episode = self.buffer[-1] last_episode.insert(transition) self.transitions.append(transition) @@ -109,7 +110,7 @@ class EpisodicExperienceReplay(Memory): n_step_return=self.tp.agent.n_step) self.buffer[-1].update_measurements_targets(self.tp.agent.num_predicted_steps_ahead) # self.buffer[-1].update_actions_probabilities() # used for off-policy policy optimization - self.buffer.append(Episode()) + self.buffer.append(memory.Episode()) self.enforce_length() @@ -148,7 +149,7 @@ class EpisodicExperienceReplay(Memory): def get(self, index): return self.get_episode(index) - def get_last_complete_episode(self) -> Union[None, Episode]: + def get_last_complete_episode(self) -> typing.Union[None, memory.Episode]: """ Returns the last complete episode in the memory or None if there are no complete episodes :return: None or the last complete episode @@ -170,7 +171,7 @@ class EpisodicExperienceReplay(Memory): def clean(self): self.transitions = [] - self.buffer = [Episode()] + self.buffer = [memory.Episode()] self._length = 1 self._num_transitions = 0 self._num_transitions_in_complete_episodes = 0 diff --git a/memories/memory.py b/memories/memory.py index f4c6a87..88be6c5 100644 --- a/memories/memory.py +++ b/memories/memory.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
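A minor typing note on the hunks above: typing.Union[None, bool] (in ask_yes_no) and typing.Union[None, memory.Episode] (in get_last_complete_episode) are equivalent to, and more commonly written with, typing.Optional. Bare signature sketches only, not the real methods:

import typing

# Equivalent, more idiomatic spellings of the annotations used above.
def ask_yes_no(title: str, default: typing.Optional[bool] = None) -> bool: ...
def get_last_complete_episode() -> typing.Optional['Episode']: ...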
# - import numpy as np -import copy -from configurations import * class Memory(object): diff --git a/parallel_actor.py b/parallel_actor.py index c988e49..581a357 100644 --- a/parallel_actor.py +++ b/parallel_actor.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import argparse -import tensorflow as tf -from architectures import * -from environments import * -from agents import * -from utils import * +import os import time -import copy -from logger import * -from configurations import * -from presets import * -import shutil + +import tensorflow as tf + +import agents +import environments +import logger +import presets start_time = time.time() @@ -66,15 +63,15 @@ if __name__ == "__main__": elif args.job_name == "worker": # get tuning parameters - tuning_parameters = json_to_preset(args.load_json_path) + tuning_parameters = presets.json_to_preset(args.load_json_path) # dump documentation if not os.path.exists(tuning_parameters.experiment_path): os.makedirs(tuning_parameters.experiment_path) if tuning_parameters.evaluate_only: - logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id, filename='evaluator') + logger.logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id, filename='evaluator') else: - logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id) + logger.logger.set_dump_dir(tuning_parameters.experiment_path, tuning_parameters.task_id) # multi-threading parameters tuning_parameters.start_time = start_time @@ -98,8 +95,8 @@ if __name__ == "__main__": cluster=cluster) # create the agent and the environment - env_instance = create_environment(tuning_parameters) - exec('agent = ' + tuning_parameters.agent.type + '(env_instance, tuning_parameters, replicated_device=device, ' + env_instance = environments.create_environment(tuning_parameters) + exec('agent = agents.' + tuning_parameters.agent.type + '(env_instance, tuning_parameters, replicated_device=device, ' 'thread_id=tuning_parameters.task_id)') # building the scaffold @@ -169,6 +166,6 @@ if __name__ == "__main__": else: agent.improve() else: - screen.error("Invalid mode requested for parallel_actor.") + logger.screen.error("Invalid mode requested for parallel_actor.") exit(1) diff --git a/plot_atari.py b/plot_atari.py index 8732fb0..7241000 100644 --- a/plot_atari.py +++ b/plot_atari.py @@ -1,8 +1,10 @@ import argparse +import os + import matplotlib import matplotlib.pyplot as plt + from dashboard import SignalsFile -import os class FigureMaker(object): diff --git a/presets.py b/presets.py index dcfb765..3e24c38 100644 --- a/presets.py +++ b/presets.py @@ -13,37 +13,43 @@ # See the License for the specific language governing permissions and # limitations under the License. 
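Both the exec-built assignment in parallel_actor.py above ('agent = agents.' + tuning_parameters.agent.type + ...) and the string-based eval lookups in the json_to_preset changes that follow construct attribute access from strings. A getattr-based sketch achieves the same dynamic lookup without exec or eval; the helper name here is hypothetical, but the attributes it reads (agent.type, task_id) are the ones used above:

import agents

def create_agent(tuning_parameters, device, env_instance):
    # Resolve the agent class by name from the agents package, then
    # instantiate it -- the same effect as the exec()-built statement above.
    agent_class = getattr(agents, tuning_parameters.agent.type)
    return agent_class(env_instance, tuning_parameters,
                       replicated_device=device,
                       thread_id=tuning_parameters.task_id)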
# - -from configurations import * import ast +import json import sys +import agents +import configurations as conf +import environments as env +import exploration_policies as ep +import presets + def json_to_preset(json_path): with open(json_path, 'r') as json_file: run_dict = json.loads(json_file.read()) if run_dict['preset'] is None: - tuning_parameters = Preset(eval(run_dict['agent_type']), eval(run_dict['environment_type']), - eval(run_dict['exploration_policy_type'])) + tuning_parameters = conf.Preset(eval('agents.' + run_dict['agent_type']), + eval('env.' + run_dict['environment_type']), + eval('ep.' + run_dict['exploration_policy_type'])) else: - tuning_parameters = eval(run_dict['preset'])() + tuning_parameters = eval('presets.' + run_dict['preset'])() # Override existing parts of the preset if run_dict['agent_type'] is not None: - tuning_parameters.agent = eval(run_dict['agent_type'])() + tuning_parameters.agent = eval('agents.' + run_dict['agent_type'])() if run_dict['environment_type'] is not None: - tuning_parameters.env = eval(run_dict['environment_type'])() + tuning_parameters.env = eval('env.' + run_dict['environment_type'])() if run_dict['exploration_policy_type'] is not None: - tuning_parameters.exploration = eval(run_dict['exploration_policy_type'])() + tuning_parameters.exploration = eval('ep.' + run_dict['exploration_policy_type'])() # human control if run_dict['play']: tuning_parameters.agent.type = 'HumanAgent' tuning_parameters.env.human_control = True tuning_parameters.num_heatup_steps = 0 - + if run_dict['level']: tuning_parameters.env.level = run_dict['level'] @@ -69,9 +75,9 @@ def json_to_preset(json_path): return tuning_parameters -class Doom_Basic_DQN(Preset): +class Doom_Basic_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -79,9 +85,9 @@ class Doom_Basic_DQN(Preset): self.num_heatup_steps = 1000 -class Doom_Basic_QRDQN(Preset): +class Doom_Basic_QRDQN(conf.Preset): def __init__(self): - Preset.__init__(self, QuantileRegressionDQN, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.QuantileRegressionDQN, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.agent.num_steps_between_copying_online_weights_to_target = 1000 self.learning_rate = 0.00025 @@ -89,9 +95,9 @@ class Doom_Basic_QRDQN(Preset): self.num_heatup_steps = 1000 -class Doom_Basic_OneStepQ(Preset): +class Doom_Basic_OneStepQ(conf.Preset): def __init__(self): - Preset.__init__(self, NStepQ, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.NStepQ, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.learning_rate = 0.00025 self.num_heatup_steps = 0 @@ -101,9 +107,9 @@ class Doom_Basic_OneStepQ(Preset): self.agent.targets_horizon = '1-Step' -class Doom_Basic_NStepQ(Preset): +class Doom_Basic_NStepQ(conf.Preset): def __init__(self): - Preset.__init__(self, NStepQ, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.NStepQ, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.learning_rate = 0.000025 self.num_heatup_steps = 0 @@ -112,9 +118,9 @@ class Doom_Basic_NStepQ(Preset): self.clip_gradients = 1000 -class Doom_Basic_A2C(Preset): +class Doom_Basic_A2C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, Doom, CategoricalExploration) + conf.Preset.__init__(self, 
conf.ActorCritic, conf.Doom, conf.CategoricalExploration) self.env.level = 'basic' self.agent.policy_gradient_rescaler = 'A_VALUE' self.learning_rate = 0.00025 @@ -122,19 +128,19 @@ class Doom_Basic_A2C(Preset): self.env.reward_scaling = 100. -class Doom_Basic_Dueling_DDQN(Preset): +class Doom_Basic_Dueling_DDQN(conf.Preset): def __init__(self): - Preset.__init__(self, DDQN, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DDQN, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' - self.agent.output_types = [OutputTypes.DuelingQ] + self.agent.output_types = [conf.OutputTypes.DuelingQ] self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 self.agent.num_steps_between_copying_online_weights_to_target = 1000 self.num_heatup_steps = 1000 -class Doom_Basic_Dueling_DQN(Preset): +class Doom_Basic_Dueling_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DuelingDQN, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DuelingDQN, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -142,11 +148,11 @@ class Doom_Basic_Dueling_DQN(Preset): self.num_heatup_steps = 1000 -class CartPole_Dueling_DDQN(Preset): +class CartPole_Dueling_DDQN(conf.Preset): def __init__(self): - Preset.__init__(self, DDQN, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.DDQN, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' - self.agent.output_types = [OutputTypes.DuelingQ] + self.agent.output_types = [conf.OutputTypes.DuelingQ] self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 self.agent.num_steps_between_copying_online_weights_to_target = 100 @@ -159,9 +165,9 @@ class CartPole_Dueling_DDQN(Preset): self.test_min_return_threshold = 150 -class Doom_Health_MMC(Preset): +class Doom_Health_MMC(conf.Preset): def __init__(self): - Preset.__init__(self, MMC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.MMC, conf.Doom, conf.ExplorationParameters) self.env.level = 'HEALTH_GATHERING' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -169,9 +175,9 @@ class Doom_Health_MMC(Preset): self.num_heatup_steps = 1000 self.exploration.epsilon_decay_steps = 10000 -class CartPole_MMC(Preset): +class CartPole_MMC(conf.Preset): def __init__(self): - Preset.__init__(self, MMC, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.MMC, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.00025 @@ -185,9 +191,9 @@ class CartPole_MMC(Preset): self.test_min_return_threshold = 150 -class CartPole_PAL(Preset): +class CartPole_PAL(conf.Preset): def __init__(self): - Preset.__init__(self, PAL, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.PAL, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.00025 @@ -200,9 +206,9 @@ class CartPole_PAL(Preset): self.test_max_step_threshold = 100 self.test_min_return_threshold = 150 -class Doom_Basic_DFP(Preset): +class Doom_Basic_DFP(conf.Preset): def __init__(self): - Preset.__init__(self, DFP, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DFP, conf.Doom, conf.ExplorationParameters) 
self.env.level = 'BASIC' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.0001 @@ -213,9 +219,9 @@ class Doom_Basic_DFP(Preset): # self.agent.num_consecutive_playing_steps = 10 -class Doom_Health_DFP(Preset): +class Doom_Health_DFP(conf.Preset): def __init__(self): - Preset.__init__(self, DFP, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DFP, conf.Doom, conf.ExplorationParameters) self.env.level = 'HEALTH_GATHERING' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -224,9 +230,9 @@ class Doom_Health_DFP(Preset): self.agent.use_accumulated_reward_as_measurement = True -class Doom_Deadly_Corridor_Bootstrapped_DQN(Preset): +class Doom_Deadly_Corridor_Bootstrapped_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, BootstrappedDQN, Doom, BootstrappedDQNExploration) + conf.Preset.__init__(self, conf.BootstrappedDQN, conf.Doom, conf.BootstrappedDQNExploration) self.env.level = 'deadly_corridor' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -234,9 +240,9 @@ class Doom_Deadly_Corridor_Bootstrapped_DQN(Preset): self.num_heatup_steps = 1000 -class CartPole_Bootstrapped_DQN(Preset): +class CartPole_Bootstrapped_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, BootstrappedDQN, GymVectorObservation, BootstrappedDQNExploration) + conf.Preset.__init__(self, conf.BootstrappedDQN, conf.GymVectorObservation, conf.BootstrappedDQNExploration) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 200 self.learning_rate = 0.00025 @@ -249,9 +255,9 @@ class CartPole_Bootstrapped_DQN(Preset): self.test_max_step_threshold = 200 self.test_min_return_threshold = 150 -class CartPole_PG(Preset): +class CartPole_PG(conf.Preset): def __init__(self): - Preset.__init__(self, PolicyGradient, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.PolicyGradient, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'CartPole-v0' self.agent.policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP' self.learning_rate = 0.001 @@ -263,9 +269,9 @@ class CartPole_PG(Preset): self.test_min_return_threshold = 150 -class CartPole_PPO(Preset): +class CartPole_PPO(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.PPO, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'CartPole-v0' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -281,9 +287,9 @@ class CartPole_PPO(Preset): self.test_max_step_threshold = 200 self.test_min_return_threshold = 150 -class CartPole_ClippedPPO(Preset): +class CartPole_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'CartPole-v0' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -301,9 +307,9 @@ class CartPole_ClippedPPO(Preset): self.test_max_step_threshold = 200 self.test_min_return_threshold = 150 -class CartPole_A2C(Preset): +class CartPole_A2C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'CartPole-v0' self.agent.policy_gradient_rescaler = 'A_VALUE' self.learning_rate = 
0.001 @@ -316,9 +322,9 @@ class CartPole_A2C(Preset): self.test_min_return_threshold = 150 -class CartPole_OneStepQ(Preset): +class CartPole_OneStepQ(conf.Preset): def __init__(self): - Preset.__init__(self, NStepQ, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.NStepQ, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.0001 @@ -327,9 +333,9 @@ class CartPole_OneStepQ(Preset): self.agent.targets_horizon = '1-Step' -class CartPole_NStepQ(Preset): +class CartPole_NStepQ(conf.Preset): def __init__(self): - Preset.__init__(self, NStepQ, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.NStepQ, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.0001 @@ -343,9 +349,9 @@ class CartPole_NStepQ(Preset): self.test_min_return_threshold = 150 self.test_num_workers = 8 -class CartPole_DQN(Preset): +class CartPole_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.00025 @@ -359,9 +365,9 @@ class CartPole_DQN(Preset): self.test_min_return_threshold = 150 -class CartPole_C51(Preset): +class CartPole_C51(conf.Preset): def __init__(self): - Preset.__init__(self, CategoricalDQN, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.CategoricalDQN, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.00025 @@ -378,9 +384,9 @@ class CartPole_C51(Preset): self.test_min_return_threshold = 150 -class CartPole_QRDQN(Preset): +class CartPole_QRDQN(conf.Preset): def __init__(self): - Preset.__init__(self, QuantileRegressionDQN, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.QuantileRegressionDQN, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.agent.num_steps_between_copying_online_weights_to_target = 100 self.learning_rate = 0.00025 @@ -394,9 +400,9 @@ class CartPole_QRDQN(Preset): # This a very resource intensive preset, and might easily blow up your RAM (> 100GB of usage). # Try reducing the number of transitions in the experience replay (50e3 might be a reasonable number to start with), # so to make sure it fits your RAM. 
-class Breakout_DQN(Preset): +class Breakout_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -415,9 +421,9 @@ class Breakout_DQN(Preset): # self.rescaling_interpolation_type = 'nearest' # TODO: remove -class Breakout_DDQN(Preset): +class Breakout_DDQN(conf.Preset): def __init__(self): - Preset.__init__(self, DDQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DDQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 30000 self.learning_rate = 0.00025 @@ -434,11 +440,11 @@ class Breakout_DDQN(Preset): self.agent.replace_mse_with_huber_loss = True -class Breakout_Dueling_DDQN(Preset): +class Breakout_Dueling_DDQN(conf.Preset): def __init__(self): - Preset.__init__(self, DDQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DDQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' - self.agent.output_types = [OutputTypes.DuelingQ] + self.agent.output_types = [conf.OutputTypes.DuelingQ] self.agent.num_steps_between_copying_online_weights_to_target = 30000 self.learning_rate = 0.00025 self.agent.num_transitions_in_experience_replay = 1000000 @@ -453,9 +459,9 @@ class Breakout_Dueling_DDQN(Preset): self.evaluate_every_x_episodes = 25 self.agent.replace_mse_with_huber_loss = True -class Alien_DQN(Preset): +class Alien_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'AlienDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -470,9 +476,9 @@ class Alien_DQN(Preset): self.evaluate_every_x_episodes = 5 -class Breakout_C51(Preset): +class Breakout_C51(conf.Preset): def __init__(self): - Preset.__init__(self, CategoricalDQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.CategoricalDQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -490,9 +496,9 @@ class Breakout_C51(Preset): -class Breakout_QRDQN(Preset): +class Breakout_QRDQN(conf.Preset): def __init__(self): - Preset.__init__(self, QuantileRegressionDQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.QuantileRegressionDQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -507,9 +513,9 @@ class Breakout_QRDQN(Preset): self.evaluate_every_x_episodes = 50 -class Atari_DQN_TestBench(Preset): +class Atari_DQN_TestBench(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -525,9 +531,9 @@ class Atari_DQN_TestBench(Preset): self.num_training_iterations = 500 -class Doom_Basic_PG(Preset): +class Doom_Basic_PG(conf.Preset): def __init__(self): - 
Preset.__init__(self, PolicyGradient, Doom, CategoricalExploration) + conf.Preset.__init__(self, conf.PolicyGradient, conf.Doom, conf.CategoricalExploration) self.env.level = 'basic' self.agent.policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP' self.learning_rate = 0.00001 @@ -535,18 +541,18 @@ class Doom_Basic_PG(Preset): self.agent.beta_entropy = 0.01 -class InvertedPendulum_PG(Preset): +class InvertedPendulum_PG(conf.Preset): def __init__(self): - Preset.__init__(self, PolicyGradient, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.PolicyGradient, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'InvertedPendulum-v1' self.agent.policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP' self.learning_rate = 0.001 self.num_heatup_steps = 0 -class Pendulum_PG(Preset): +class Pendulum_PG(conf.Preset): def __init__(self): - Preset.__init__(self, PolicyGradient, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.PolicyGradient, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'Pendulum-v0' self.agent.policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP' self.learning_rate = 0.001 @@ -554,9 +560,9 @@ class Pendulum_PG(Preset): self.agent.apply_gradients_every_x_episodes = 10 -class Pendulum_DDPG(Preset): +class Pendulum_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.DDPG, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'Pendulum-v0' self.learning_rate = 0.001 self.num_heatup_steps = 1000 @@ -567,18 +573,18 @@ class Pendulum_DDPG(Preset): self.test_min_return_threshold = -250 -class InvertedPendulum_DDPG(Preset): +class InvertedPendulum_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, GymVectorObservation, OUExploration) + conf.Preset.__init__(self, conf.DDPG, conf.GymVectorObservation, conf.OUExploration) self.env.level = 'InvertedPendulum-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 100 self.env.normalize_observation = True -class InvertedPendulum_PPO(Preset): +class InvertedPendulum_PPO(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.PPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'InvertedPendulum-v1' self.learning_rate = 0.001 self.num_heatup_steps = 0 @@ -595,9 +601,9 @@ class InvertedPendulum_PPO(Preset): self.env.normalize_observation = True -class Pendulum_ClippedPPO(Preset): +class Pendulum_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Pendulum-v0' self.learning_rate = 0.00005 self.num_heatup_steps = 0 @@ -613,9 +619,9 @@ class Pendulum_ClippedPPO(Preset): self.agent.beta_entropy = 0.01 -class Hopper_DPPO(Preset): +class Hopper_DPPO(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.PPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Hopper-v1' self.learning_rate = 0.00001 self.num_heatup_steps = 0 @@ -631,9 +637,9 @@ class Hopper_DPPO(Preset): self.env.normalize_observation = True -class InvertedPendulum_ClippedPPO(Preset): +class 
InvertedPendulum_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'InvertedPendulum-v1' self.learning_rate = 0.00005 self.num_heatup_steps = 0 @@ -647,9 +653,9 @@ class InvertedPendulum_ClippedPPO(Preset): self.agent.optimizer_type = 'Adam' self.env.normalize_observation = True -class Humanoid_ClippedPPO(Preset): +class Humanoid_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Humanoid-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -664,9 +670,9 @@ class Humanoid_ClippedPPO(Preset): self.env.normalize_observation = True -class Hopper_ClippedPPO(Preset): +class Hopper_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Hopper-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -681,9 +687,9 @@ class Hopper_ClippedPPO(Preset): self.env.normalize_observation = True -class InvertedPendulum_ClippedPPO_Roboschool(Preset): +class InvertedPendulum_ClippedPPO_Roboschool(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, Roboschool, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.Roboschool, conf.ExplorationParameters) self.env.level = 'RoboschoolInvertedPendulum-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -698,9 +704,9 @@ class InvertedPendulum_ClippedPPO_Roboschool(Preset): self.env.normalize_observation = True -class HalfCheetah_ClippedPPO_Roboschool(Preset): +class HalfCheetah_ClippedPPO_Roboschool(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, Roboschool, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.Roboschool, conf.ExplorationParameters) self.env.level = 'RoboschoolHalfCheetah-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -715,9 +721,9 @@ class HalfCheetah_ClippedPPO_Roboschool(Preset): self.env.normalize_observation = True -class Hopper_ClippedPPO_Roboschool(Preset): +class Hopper_ClippedPPO_Roboschool(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, Roboschool, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.Roboschool, conf.ExplorationParameters) self.env.level = 'RoboschoolHopper-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -732,9 +738,9 @@ class Hopper_ClippedPPO_Roboschool(Preset): self.env.normalize_observation = True -class Ant_ClippedPPO(Preset): +class Ant_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Ant-v1' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -749,9 +755,9 @@ class Ant_ClippedPPO(Preset): self.env.normalize_observation = True -class Hopper_ClippedPPO_Distributed(Preset): +class Hopper_ClippedPPO_Distributed(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, 
conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Hopper-v1' self.learning_rate = 0.00001 self.num_heatup_steps = 0 @@ -766,17 +772,17 @@ class Hopper_ClippedPPO_Distributed(Preset): self.env.normalize_observation = True -class Hopper_DDPG_Roboschool(Preset): +class Hopper_DDPG_Roboschool(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, Roboschool, OUExploration) + conf.Preset.__init__(self, conf.DDPG, conf.Roboschool, conf.OUExploration) self.env.level = 'RoboschoolHopper-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 100 -class Hopper_PPO_Roboschool(Preset): +class Hopper_PPO_Roboschool(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, Roboschool, ExplorationParameters) + conf.Preset.__init__(self, conf.PPO, conf.Roboschool, conf.ExplorationParameters) self.env.level = 'RoboschoolHopper-v1' self.learning_rate = 0.001 self.num_heatup_steps = 0 @@ -790,27 +796,27 @@ class Hopper_PPO_Roboschool(Preset): self.agent.optimizer_type = 'LBFGS' -class Hopper_DDPG(Preset): +class Hopper_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, GymVectorObservation, OUExploration) + conf.Preset.__init__(self, conf.DDPG, conf.GymVectorObservation, conf.OUExploration) self.env.level = 'Hopper-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 100 self.env.normalize_observation = True -class Hopper_DDDPG(Preset): +class Hopper_DDDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDDPG, GymVectorObservation, OUExploration) + conf.Preset.__init__(self, conf.DDDPG, conf.GymVectorObservation, conf.OUExploration) self.env.level = 'Hopper-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 100 self.env.normalize_observation = True -class Hopper_PPO(Preset): +class Hopper_PPO(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.PPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'Hopper-v1' self.learning_rate = 0.001 self.num_heatup_steps = 0 @@ -826,9 +832,9 @@ class Hopper_PPO(Preset): self.env.normalize_observation = True -class Walker_PPO(Preset): +class Walker_PPO(conf.Preset): def __init__(self): - Preset.__init__(self, PPO, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.PPO, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'Walker2d-v1' self.learning_rate = 0.001 self.num_heatup_steps = 0 @@ -843,27 +849,27 @@ class Walker_PPO(Preset): self.env.normalize_observation = True -class HalfCheetah_DDPG(Preset): +class HalfCheetah_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, GymVectorObservation, OUExploration) + conf.Preset.__init__(self, conf.DDPG, conf.GymVectorObservation, conf.OUExploration) self.env.level = 'HalfCheetah-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 1000 self.env.normalize_observation = True -class Ant_DDPG(Preset): +class Ant_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, GymVectorObservation, OUExploration) + conf.Preset.__init__(self, conf.DDPG, conf.GymVectorObservation, conf.OUExploration) self.env.level = 'Ant-v1' self.learning_rate = 0.00025 self.num_heatup_steps = 1000 self.env.normalize_observation = True -class Pendulum_NAF(Preset): +class Pendulum_NAF(conf.Preset): def __init__(self): - Preset.__init__(self, NAF, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.NAF, conf.GymVectorObservation, conf.AdditiveNoiseExploration) 
self.env.level = 'Pendulum-v0' self.learning_rate = 0.001 self.num_heatup_steps = 1000 @@ -875,18 +881,18 @@ class Pendulum_NAF(Preset): self.test_min_return_threshold = -250 -class InvertedPendulum_NAF(Preset): +class InvertedPendulum_NAF(conf.Preset): def __init__(self): - Preset.__init__(self, NAF, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.NAF, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'InvertedPendulum-v1' self.learning_rate = 0.001 self.num_heatup_steps = 1000 self.batch_size = 100 -class Hopper_NAF(Preset): +class Hopper_NAF(conf.Preset): def __init__(self): - Preset.__init__(self, NAF, GymVectorObservation, AdditiveNoiseExploration) + conf.Preset.__init__(self, conf.NAF, conf.GymVectorObservation, conf.AdditiveNoiseExploration) self.env.level = 'Hopper-v1' self.learning_rate = 0.0005 self.num_heatup_steps = 1000 @@ -895,9 +901,9 @@ class Hopper_NAF(Preset): self.env.normalize_observation = True -class CartPole_NEC(Preset): +class CartPole_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'CartPole-v0' self.learning_rate = 0.00025 self.agent.num_episodes_in_experience_replay = 200 @@ -912,9 +918,9 @@ class CartPole_NEC(Preset): self.test_min_return_threshold = 150 -class Doom_Basic_NEC(Preset): +class Doom_Basic_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.learning_rate = 0.00001 self.agent.num_transitions_in_experience_replay = 100000 @@ -928,9 +934,9 @@ class Doom_Basic_NEC(Preset): -class Montezuma_NEC(Preset): +class Montezuma_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Atari, conf.ExplorationParameters) self.env.level = 'MontezumaRevenge-v0' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -938,9 +944,9 @@ class Montezuma_NEC(Preset): self.agent.num_playing_steps_between_two_training_steps = 1 -class Breakout_NEC(Preset): +class Breakout_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Atari, conf.ExplorationParameters) self.env.level = 'BreakoutDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00001 @@ -958,9 +964,9 @@ class Breakout_NEC(Preset): self.seed = 123 -class Doom_Health_NEC(Preset): +class Doom_Health_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Doom, conf.ExplorationParameters) self.env.level = 'HEALTH_GATHERING' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -969,9 +975,9 @@ class Doom_Health_NEC(Preset): self.agent.num_playing_steps_between_two_training_steps = 1 -class Doom_Health_DQN(Preset): +class Doom_Health_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Doom, conf.ExplorationParameters) self.env.level = 'HEALTH_GATHERING' self.agent.num_episodes_in_experience_replay = 200 self.learning_rate = 0.00025 @@ -980,22 +986,22 @@ class Doom_Health_DQN(Preset): 
self.agent.num_steps_between_copying_online_weights_to_target = 1000 -class Pong_NEC_LSTM(Preset): +class Pong_NEC_LSTM(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Atari, conf.ExplorationParameters) self.env.level = 'PongDeterministic-v4' self.learning_rate = 0.001 self.agent.num_transitions_in_experience_replay = 1000000 - self.agent.middleware_type = MiddlewareTypes.LSTM + self.agent.middleware_type = conf.MiddlewareTypes.LSTM self.exploration.initial_epsilon = 0.5 self.exploration.final_epsilon = 0.1 self.exploration.epsilon_decay_steps = 1000000 self.num_heatup_steps = 500 -class Pong_NEC(Preset): +class Pong_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Atari, conf.ExplorationParameters) self.env.level = 'PongDeterministic-v4' self.learning_rate = 0.00001 self.agent.num_transitions_in_experience_replay = 100000 @@ -1012,9 +1018,9 @@ class Pong_NEC(Preset): # self.seed = 123 -class Alien_NEC(Preset): +class Alien_NEC(conf.Preset): def __init__(self): - Preset.__init__(self, NEC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.NEC, conf.Atari, conf.ExplorationParameters) self.env.level = 'AlienDeterministic-v4' self.learning_rate = 0.0001 self.agent.num_transitions_in_experience_replay = 100000 @@ -1029,9 +1035,9 @@ class Alien_NEC(Preset): self.seed = 123 -class Pong_DQN(Preset): +class Pong_DQN(conf.Preset): def __init__(self): - Preset.__init__(self, DQN, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.DQN, conf.Atari, conf.ExplorationParameters) self.env.level = 'PongDeterministic-v4' self.agent.num_steps_between_copying_online_weights_to_target = 10000 self.learning_rate = 0.00025 @@ -1047,9 +1053,9 @@ class Pong_DQN(Preset): self.seed = 123 -class CartPole_A3C(Preset): +class CartPole_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'CartPole-v0' self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 @@ -1060,7 +1066,7 @@ class CartPole_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.01 self.agent.num_steps_between_gradient_updates = 5 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC self.test = True self.test_max_step_threshold = 1000 @@ -1068,9 +1074,9 @@ class CartPole_A3C(Preset): self.test_num_workers = 8 -class MountainCar_A3C(Preset): +class MountainCar_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.CategoricalExploration) self.env.level = 'MountainCar-v0' self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 @@ -1081,12 +1087,12 @@ class MountainCar_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.01 self.agent.num_steps_between_gradient_updates = 5 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class InvertedPendulum_A3C(Preset): +class InvertedPendulum_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, 
conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'InvertedPendulum-v1' self.agent.policy_gradient_rescaler = 'A_VALUE' self.agent.optimizer_type = 'Adam' @@ -1099,12 +1105,12 @@ class InvertedPendulum_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.005 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Hopper_A3C(Preset): +class Hopper_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'Hopper-v1' self.agent.policy_gradient_rescaler = 'GAE' self.agent.optimizer_type = 'Adam' @@ -1117,7 +1123,7 @@ class Hopper_A3C(Preset): self.agent.gae_lambda = 0.98 self.agent.beta_entropy = 0.005 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC class HopperIceWall_A3C(Hopper_A3C): @@ -1138,9 +1144,9 @@ class HopperBullet_A3C(Hopper_A3C): self.env.level = 'HopperBulletEnv-v0' -class Kuka_ClippedPPO(Preset): +class Kuka_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'KukaBulletEnv-v0' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -1155,9 +1161,9 @@ class Kuka_ClippedPPO(Preset): self.env.normalize_observation = True -class Minitaur_ClippedPPO(Preset): +class Minitaur_ClippedPPO(conf.Preset): def __init__(self): - Preset.__init__(self, ClippedPPO, GymVectorObservation, ExplorationParameters) + conf.Preset.__init__(self, conf.ClippedPPO, conf.GymVectorObservation, conf.ExplorationParameters) self.env.level = 'MinitaurBulletEnv-v0' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -1172,9 +1178,9 @@ class Minitaur_ClippedPPO(Preset): self.env.normalize_observation = True -class Walker_A3C(Preset): +class Walker_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'Walker2d-v1' self.agent.policy_gradient_rescaler = 'A_VALUE' self.agent.optimizer_type = 'Adam' @@ -1187,12 +1193,12 @@ class Walker_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.005 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Ant_A3C(Preset): +class Ant_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'Ant-v1' self.agent.policy_gradient_rescaler = 'A_VALUE' self.agent.optimizer_type = 'Adam' @@ -1205,7 +1211,7 @@ class Ant_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.005 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC self.env.normalize_observation = True @@ -1221,9 +1227,9 @@ class AntMaze_A3C(Ant_A3C): self.env.level = 'AntMaze-v0' -class Humanoid_A3C(Preset): +class Humanoid_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, 
EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'Humanoid-v1' self.agent.policy_gradient_rescaler = 'A_VALUE' self.agent.optimizer_type = 'Adam' @@ -1236,13 +1242,13 @@ class Humanoid_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.005 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC self.env.normalize_observation = True -class Pendulum_A3C(Preset): +class Pendulum_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'Pendulum-v0' self.agent.policy_gradient_rescaler = 'GAE' self.agent.optimizer_type = 'Adam' @@ -1254,9 +1260,9 @@ class Pendulum_A3C(Preset): -class BipedalWalker_A3C(Preset): +class BipedalWalker_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, GymVectorObservation, EntropyExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.GymVectorObservation, conf.EntropyExploration) self.env.level = 'BipedalWalker-v2' self.agent.policy_gradient_rescaler = 'A_VALUE' self.agent.optimizer_type = 'RMSProp' @@ -1269,12 +1275,12 @@ class BipedalWalker_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.005 self.clip_gradients = None - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Doom_Basic_A3C(Preset): +class Doom_Basic_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, Doom, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.Doom, conf.CategoricalExploration) self.env.level = 'basic' self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 @@ -1286,12 +1292,12 @@ class Doom_Basic_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.01 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Pong_A3C(Preset): +class Pong_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, Atari, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.Atari, conf.CategoricalExploration) self.env.level = 'PongDeterministic-v4' self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 @@ -1302,12 +1308,12 @@ class Pong_A3C(Preset): self.agent.gae_lambda = 1. 
self.agent.beta_entropy = 0.01 self.clip_gradients = 40.0 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Breakout_A3C(Preset): +class Breakout_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, Atari, CategoricalExploration) + conf.Preset.__init__(self, conf.ActorCritic, conf.Atari, conf.CategoricalExploration) self.env.level = 'BreakoutDeterministic-v4' self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 @@ -1318,13 +1324,13 @@ class Breakout_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.05 self.clip_gradients = 40.0 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Carla_A3C(Preset): +class Carla_A3C(conf.Preset): def __init__(self): - Preset.__init__(self, ActorCritic, Carla, EntropyExploration) - self.agent.embedder_complexity = EmbedderComplexity.Deep + conf.Preset.__init__(self, conf.ActorCritic, conf.Carla, conf.EntropyExploration) + self.agent.embedder_complexity = conf.EmbedderComplexity.Deep self.agent.policy_gradient_rescaler = 'GAE' self.learning_rate = 0.0001 self.num_heatup_steps = 0 @@ -1335,22 +1341,22 @@ class Carla_A3C(Preset): self.agent.gae_lambda = 1 self.agent.beta_entropy = 0.01 self.clip_gradients = 40 - self.agent.middleware_type = MiddlewareTypes.FC + self.agent.middleware_type = conf.MiddlewareTypes.FC -class Carla_DDPG(Preset): +class Carla_DDPG(conf.Preset): def __init__(self): - Preset.__init__(self, DDPG, Carla, OUExploration) - self.agent.embedder_complexity = EmbedderComplexity.Deep + conf.Preset.__init__(self, conf.DDPG, conf.Carla, conf.OUExploration) + self.agent.embedder_complexity = conf.EmbedderComplexity.Deep self.learning_rate = 0.0001 self.num_heatup_steps = 1000 self.agent.num_consecutive_training_steps = 5 -class Carla_BC(Preset): +class Carla_BC(conf.Preset): def __init__(self): - Preset.__init__(self, BC, Carla, ExplorationParameters) - self.agent.embedder_complexity = EmbedderComplexity.Deep + conf.Preset.__init__(self, conf.BC, conf.Carla, conf.ExplorationParameters) + self.agent.embedder_complexity = conf.EmbedderComplexity.Deep self.agent.load_memory_from_file_path = 'datasets/carla_town1.p' self.learning_rate = 0.0005 self.num_heatup_steps = 0 @@ -1359,9 +1365,9 @@ class Carla_BC(Preset): self.evaluate_every_x_training_iterations = 5000 -class Doom_Basic_BC(Preset): +class Doom_Basic_BC(conf.Preset): def __init__(self): - Preset.__init__(self, BC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.BC, conf.Doom, conf.ExplorationParameters) self.env.level = 'basic' self.agent.load_memory_from_file_path = 'datasets/doom_basic.p' self.learning_rate = 0.0005 @@ -1372,9 +1378,9 @@ class Doom_Basic_BC(Preset): self.num_training_iterations = 2000 -class Doom_Defend_BC(Preset): +class Doom_Defend_BC(conf.Preset): def __init__(self): - Preset.__init__(self, BC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.BC, conf.Doom, conf.ExplorationParameters) self.env.level = 'defend' self.agent.load_memory_from_file_path = 'datasets/doom_defend.p' self.learning_rate = 0.0005 @@ -1384,9 +1390,9 @@ class Doom_Defend_BC(Preset): self.evaluate_every_x_training_iterations = 100 -class Doom_Deathmatch_BC(Preset): +class Doom_Deathmatch_BC(conf.Preset): def __init__(self): - Preset.__init__(self, BC, Doom, ExplorationParameters) + conf.Preset.__init__(self, conf.BC, conf.Doom, conf.ExplorationParameters) self.env.level = 'deathmatch' 
self.agent.load_memory_from_file_path = 'datasets/doom_deathmatch.p' self.learning_rate = 0.0005 @@ -1396,9 +1402,9 @@ class Doom_Deathmatch_BC(Preset): self.evaluate_every_x_training_iterations = 100 -class MontezumaRevenge_BC(Preset): +class MontezumaRevenge_BC(conf.Preset): def __init__(self): - Preset.__init__(self, BC, Atari, ExplorationParameters) + conf.Preset.__init__(self, conf.BC, conf.Atari, conf.ExplorationParameters) self.env.level = 'MontezumaRevenge-v0' self.agent.load_memory_from_file_path = 'datasets/montezuma_revenge.p' self.learning_rate = 0.0005 diff --git a/renderer.py b/renderer.py index fee19af..2a1637f 100644 --- a/renderer.py +++ b/renderer.py @@ -1,6 +1,6 @@ -import pygame -from pygame.locals import * import numpy as np +import pygame +from pygame import locals as loc class Renderer(object): @@ -21,7 +21,8 @@ class Renderer(object): :return: None """ self.size = (width, height) - self.screen = self.display.set_mode(self.size, HWSURFACE | DOUBLEBUF) + self.screen = self.display.set_mode(self.size, + loc.HWSURFACE | loc.DOUBLEBUF) self.display.set_caption("Coach") self.is_open = True diff --git a/run_test.py b/run_test.py index 196056e..36cb2c6 100644 --- a/run_test.py +++ b/run_test.py @@ -13,23 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -# -*- coding: utf-8 -*- -import presets -import numpy as np -import pandas as pd -from os import path -import os +import argparse import glob +import os import shutil +import signal +import subprocess import sys import time -from logger import screen -from utils import list_all_classes_in_module, threaded_cmd_line_run, killed_processes -import subprocess -import signal -import argparse +import numpy as np +import pandas as pd + +import logger +import presets +import utils if __name__ == '__main__': parser = argparse.ArgumentParser() @@ -61,7 +59,7 @@ if __name__ == '__main__': if args.preset is not None: presets_lists = [args.preset] else: - presets_lists = list_all_classes_in_module(presets) + presets_lists = utils.list_all_classes_in_module(presets) win_size = 10 fail_count = 0 test_count = 0 @@ -70,7 +68,7 @@ if __name__ == '__main__': # create a clean experiment directory test_name = '__test' test_path = os.path.join('./experiments', test_name) - if path.exists(test_path): + if os.path.exists(test_path): shutil.rmtree(test_path) if args.ignore_presets is not None: presets_to_ignore = args.ignore_presets.split(',') @@ -100,7 +98,7 @@ if __name__ == '__main__': test_count += 1 # run the experiment in a separate thread - screen.log_title("Running test {} - {}".format(preset_name, framework)) + logger.screen.log_title("Running test {} - {}".format(preset_name, framework)) log_file_name = 'test_log_{preset_name}_{framework}.txt'.format( preset_name=preset_name, framework=framework, @@ -139,7 +137,7 @@ if __name__ == '__main__': tries_counter = 0 while not csv_paths: - csv_paths = glob.glob(path.join(test_path, '*', filename_pattern)) + csv_paths = glob.glob(os.path.join(test_path, '*', filename_pattern)) if tries_counter > read_csv_tries: break tries_counter += 1 @@ -195,26 +193,26 @@ if __name__ == '__main__': # kill test and print result os.killpg(os.getpgid(p.pid), signal.SIGTERM) if test_passed: - screen.success("Passed successfully") + logger.screen.success("Passed successfully") else: if csv_paths: - screen.error("Failed due to insufficient reward", crash=False) - screen.error("preset.test_max_step_threshold: {}".format(preset.test_max_step_threshold), 
crash=False) - screen.error("preset.test_min_return_threshold: {}".format(preset.test_min_return_threshold), crash=False) - screen.error("averaged_rewards: {}".format(averaged_rewards), crash=False) - screen.error("episode number: {}".format(csv['Episode #'].values[-1]), crash=False) + logger.screen.error("Failed due to insufficient reward", crash=False) + logger.screen.error("preset.test_max_step_threshold: {}".format(preset.test_max_step_threshold), crash=False) + logger.screen.error("preset.test_min_return_threshold: {}".format(preset.test_min_return_threshold), crash=False) + logger.screen.error("averaged_rewards: {}".format(averaged_rewards), crash=False) + logger.screen.error("episode number: {}".format(csv['Episode #'].values[-1]), crash=False) else: - screen.error("csv file never found", crash=False) + logger.screen.error("csv file never found", crash=False) if args.verbose: - screen.error("command exitcode: {}".format(p.returncode), crash=False) - screen.error(open(log_file_name).read(), crash=False) + logger.screen.error("command exitcode: {}".format(p.returncode), crash=False) + logger.screen.error(open(log_file_name).read(), crash=False) fail_count += 1 shutil.rmtree(test_path) - screen.separator() + logger.screen.separator() if fail_count == 0: - screen.success(" Summary: " + str(test_count) + "/" + str(test_count) + " tests passed successfully") + logger.screen.success(" Summary: " + str(test_count) + "/" + str(test_count) + " tests passed successfully") else: - screen.error(" Summary: " + str(test_count - fail_count) + "/" + str(test_count) + " tests passed successfully") + logger.screen.error(" Summary: " + str(test_count - fail_count) + "/" + str(test_count) + " tests passed successfully") diff --git a/utils.py b/utils.py index 7f75ac5..f6cfc7a 100644 --- a/utils.py +++ b/utils.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import json -import inspect -import os -import numpy as np -import threading -from subprocess import call, Popen -import signal import copy +import inspect +import json +import os +import signal +import subprocess +import threading + +import numpy as np + killed_processes = [] eps = np.finfo(np.float32).eps + class Enum(object): def __init__(self): pass @@ -161,7 +163,7 @@ def ClassToDict(x): def cmd_line_run(result, run_cmd, id=-1): - p = Popen(run_cmd, shell=True, executable="/bin/bash") + p = subprocess.Popen(run_cmd, shell=True, executable="/bin/bash") while result[0] is None or result[0] == [None]: if id in killed_processes: p.kill()
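To make the effect of the presets.py import changes concrete, here is a minimal sketch of how the module-qualified preset lookup used by json_to_preset behaves. It assumes only what the patch itself shows: the presets module defines CartPole_DQN and imports itself as presets. The getattr variant is an optional alternative to eval shown for comparison, not something the patch uses.

import presets

preset_name = 'CartPole_DQN'

# What json_to_preset now does: resolve the name against the presets module explicitly.
tuning_parameters = eval('presets.' + preset_name)()

# An equivalent lookup that avoids eval, shown only for comparison.
tuning_parameters = getattr(presets, preset_name)()

print(type(tuning_parameters).__name__)  # -> CartPole_DQN
print(tuning_parameters.env.level)       # -> 'CartPole-v0', as set in that preset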
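Because every preset now goes through the conf alias, adding a new preset follows one explicit pattern. The class below is a hypothetical example, not part of the patch: the name, level, and hyperparameter values are illustrative, and it folds in the replay-size cap (roughly 50e3 transitions) that the comment above Breakout_DQN recommends for machines with limited RAM.

import configurations as conf


class MountainCar_DQN_Example(conf.Preset):
    # Hypothetical preset written in the module-qualified style used throughout presets.py.
    def __init__(self):
        conf.Preset.__init__(self, conf.DQN, conf.GymVectorObservation, conf.ExplorationParameters)
        self.env.level = 'MountainCar-v0'
        self.agent.num_steps_between_copying_online_weights_to_target = 100
        # Small replay buffer so the preset fits in modest RAM.
        self.agent.num_transitions_in_experience_replay = 50000
        self.learning_rate = 0.00025
        self.num_heatup_steps = 1000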
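The renderer.py hunk replaces the wildcard pygame.locals import with a namespaced alias, so the display flags are spelled out rather than injected into the module namespace. A standalone sketch of that pattern follows; the window size and caption are arbitrary, and an actual display is needed for set_mode to succeed.

import pygame
from pygame import locals as loc

pygame.init()
# Hardware surface plus double buffering, referenced through the alias.
screen = pygame.display.set_mode((320, 240), loc.HWSURFACE | loc.DOUBLEBUF)
pygame.display.set_caption("Coach")
pygame.quit()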
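run_test.py and utils.py now reach subprocess, os, and signal through their module names when starting and stopping experiment processes. The sketch below mirrors that launch/terminate pattern under stated assumptions: the command string is a placeholder (the test runner builds the real coach command line per preset), and preexec_fn=os.setsid is added here only so the sketch is self-contained, since the patch does not show where the child's process group is created.

import os
import signal
import subprocess

run_cmd = "sleep 30"  # placeholder command

# Launch through bash, as utils.cmd_line_run does; setsid puts the child in its
# own process group so the whole group can be signalled later.
p = subprocess.Popen(run_cmd, shell=True, executable="/bin/bash", preexec_fn=os.setsid)

# Terminate the group, as run_test.py does once a test passes or times out.
os.killpg(os.getpgid(p.pid), signal.SIGTERM)
p.wait()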