diff --git a/agents/__init__.py b/agents/__init__.py
index fdbd13e..6be2538 100644
--- a/agents/__init__.py
+++ b/agents/__init__.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2017 Intel Corporation
+# Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,26 +13,48 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
+from agents.actor_critic_agent import ActorCriticAgent
+from agents.agent import Agent
+from agents.bc_agent import BCAgent
+from agents.bootstrapped_dqn_agent import BootstrappedDQNAgent
+from agents.categorical_dqn_agent import CategoricalDQNAgent
+from agents.clipped_ppo_agent import ClippedPPOAgent
+from agents.ddpg_agent import DDPGAgent
+from agents.ddqn_agent import DDQNAgent
+from agents.dfp_agent import DFPAgent
+from agents.dqn_agent import DQNAgent
+from agents.human_agent import HumanAgent
+from agents.imitation_agent import ImitationAgent
+from agents.mmc_agent import MixedMonteCarloAgent
+from agents.n_step_q_agent import NStepQAgent
+from agents.naf_agent import NAFAgent
+from agents.nec_agent import NECAgent
+from agents.pal_agent import PALAgent
+from agents.policy_gradients_agent import PolicyGradientsAgent
+from agents.policy_optimization_agent import PolicyOptimizationAgent
+from agents.ppo_agent import PPOAgent
+from agents.qr_dqn_agent import QuantileRegressionDQNAgent
+from agents.value_optimization_agent import ValueOptimizationAgent
-from agents.actor_critic_agent import *
-from agents.agent import *
-from agents.bc_agent import *
-from agents.bootstrapped_dqn_agent import *
-from agents.clipped_ppo_agent import *
-from agents.ddpg_agent import *
-from agents.ddqn_agent import *
-from agents.dfp_agent import *
-from agents.dqn_agent import *
-from agents.categorical_dqn_agent import *
-from agents.human_agent import *
-from agents.imitation_agent import *
-from agents.mmc_agent import *
-from agents.n_step_q_agent import *
-from agents.naf_agent import *
-from agents.nec_agent import *
-from agents.pal_agent import *
-from agents.policy_gradients_agent import *
-from agents.policy_optimization_agent import *
-from agents.ppo_agent import *
-from agents.value_optimization_agent import *
-from agents.qr_dqn_agent import *
+__all__ = ['ActorCriticAgent',
+           'Agent',
+           'BCAgent',
+           'BootstrappedDQNAgent',
+           'CategoricalDQNAgent',
+           'ClippedPPOAgent',
+           'DDPGAgent',
+           'DDQNAgent',
+           'DFPAgent',
+           'DQNAgent',
+           'HumanAgent',
+           'ImitationAgent',
+           'MixedMonteCarloAgent',
+           'NAFAgent',
+           'NECAgent',
+           'NStepQAgent',
+           'PALAgent',
+           'PPOAgent',
+           'PolicyGradientsAgent',
+           'PolicyOptimizationAgent',
+           'QuantileRegressionDQNAgent',
+           'ValueOptimizationAgent']
diff --git a/agents/actor_critic_agent.py b/agents/actor_critic_agent.py
index 729e67f..d514acd 100644
--- a/agents/actor_critic_agent.py
+++ b/agents/actor_critic_agent.py
@@ -13,23 +13,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
+import numpy as np
+from scipy import signal
-from agents.policy_optimization_agent import *
-from logger import *
-from utils import *
-import scipy.signal
+from agents import policy_optimization_agent as poa
+import utils
+import logger
 
 
 # Actor Critic - https://arxiv.org/abs/1602.01783
-class ActorCriticAgent(PolicyOptimizationAgent):
+class ActorCriticAgent(poa.PolicyOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network = False):
-        PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
+        poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network)
         self.last_gradient_update_step_idx = 0
-        self.action_advantages = Signal('Advantages')
-        self.state_values = Signal('Values')
-        self.unclipped_grads = Signal('Grads (unclipped)')
-        self.value_loss = Signal('Value Loss')
-        self.policy_loss = Signal('Policy Loss')
+        self.action_advantages = utils.Signal('Advantages')
+        self.state_values = utils.Signal('Values')
+        self.unclipped_grads = utils.Signal('Grads (unclipped)')
+        self.value_loss = utils.Signal('Value Loss')
+        self.policy_loss = utils.Signal('Policy Loss')
         self.signals.append(self.action_advantages)
         self.signals.append(self.state_values)
         self.signals.append(self.unclipped_grads)
@@ -38,7 +39,7 @@ class ActorCriticAgent(PolicyOptimizationAgent):
 
     # Discounting function used to calculate discounted returns.
     def discount(self, x, gamma):
-        return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
+        return signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
 
     def get_general_advantage_estimation_values(self, rewards, values):
         # values contain n+1 elements (t ... t+n+1), rewards contain n elements (t ...
t + n) @@ -72,20 +73,20 @@ class ActorCriticAgent(PolicyOptimizationAgent): # estimate the advantage function action_advantages = np.zeros((num_transitions, 1)) - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: if game_overs[-1]: R = 0 else: - R = self.main_network.online_network.predict(last_sample(next_states))[0] + R = self.main_network.online_network.predict(utils.last_sample(next_states))[0] for i in reversed(range(num_transitions)): R = rewards[i] + self.tp.agent.discount * R state_value_head_targets[i] = R action_advantages[i] = R - current_state_values[i] - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps - bootstrapped_value = self.main_network.online_network.predict(last_sample(next_states))[0] + bootstrapped_value = self.main_network.online_network.predict(utils.last_sample(next_states))[0] values = np.append(current_state_values, bootstrapped_value) if game_overs[-1]: values[-1] = 0 @@ -94,7 +95,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): gae_values, state_value_head_targets = self.get_general_advantage_estimation_values(rewards, values) action_advantages = np.vstack(gae_values) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") action_advantages = action_advantages.squeeze(axis=-1) if not self.env.discrete_controls and len(actions.shape) < 2: @@ -113,7 +114,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # TODO: rename curr_state -> state # convert to batch so we can run it through the network @@ -126,7 +127,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): # DISCRETE state_value, action_probabilities = self.main_network.online_network.predict(curr_state) action_probabilities = action_probabilities.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_probabilities) else: action = np.argmax(action_probabilities) @@ -137,7 +138,7 @@ class ActorCriticAgent(PolicyOptimizationAgent): state_value, action_values_mean, action_values_std = self.main_network.online_network.predict(curr_state) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) else: action = action_values_mean diff --git a/agents/agent.py b/agents/agent.py index 006544e..c1f5262 100644 --- a/agents/agent.py +++ b/agents/agent.py @@ -13,32 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -import scipy.ndimage -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") - -import copy -from renderer import Renderer -from configurations import Preset -from collections import deque -from utils import LazyStack -from collections import OrderedDict -from utils import RunPhase, Signal, is_empty, RunningStat -from architectures import * -from exploration_policies import * -from memories import * -from memories.memory import * -from logger import logger, screen +import collections import random import time -import os -import itertools -from architectures.tensorflow_components.shared_variables import SharedRunningStats + +import logger +try: + import matplotlib.pyplot as plt +except ImportError: + logger.failed_imports.append("matplotlib") + +import numpy as np +from pandas.io import pickle from six.moves import range +import scipy + +from architectures.tensorflow_components import shared_variables as sv +import configurations +import exploration_policies as ep +import memories +from memories import memory +import renderer +import utils class Agent(object): @@ -54,7 +50,7 @@ class Agent(object): :param thread_id: int """ - screen.log_title("Creating agent {}".format(task_id)) + logger.screen.log_title("Creating agent {}".format(task_id)) self.task_id = task_id self.sess = tuning_parameters.sess self.env = tuning_parameters.env_instance = env @@ -71,21 +67,20 @@ class Agent(object): # modules if tuning_parameters.agent.load_memory_from_file_path: - screen.log_title("Loading replay buffer from pickle. Pickle path: {}" + logger.screen.log_title("Loading replay buffer from pickle. Pickle path: {}" .format(tuning_parameters.agent.load_memory_from_file_path)) - self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path) + self.memory = pickle.read_pickle(tuning_parameters.agent.load_memory_from_file_path) else: - self.memory = eval(tuning_parameters.memory + '(tuning_parameters)') - # self.architecture = eval(tuning_parameters.architecture) + self.memory = eval('memories.' + tuning_parameters.memory + '(tuning_parameters)') self.has_global = replicated_device is not None self.replicated_device = replicated_device self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) if replicated_device is not None else "/gpu:0" - self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)') - self.evaluation_exploration_policy = eval(tuning_parameters.exploration.evaluation_policy + self.exploration_policy = eval('ep.' + tuning_parameters.exploration.policy + '(tuning_parameters)') + self.evaluation_exploration_policy = eval('ep.' 
+ tuning_parameters.exploration.evaluation_policy + '(tuning_parameters)') - self.evaluation_exploration_policy.change_phase(RunPhase.TEST) + self.evaluation_exploration_policy.change_phase(utils.RunPhase.TEST) # initialize all internal variables self.tp = tuning_parameters @@ -100,30 +95,30 @@ class Agent(object): self.episode_running_info = {} self.last_episode_evaluation_ran = 0 self.running_observations = [] - logger.set_current_time(self.current_episode) + logger.logger.set_current_time(self.current_episode) self.main_network = None self.networks = [] self.last_episode_images = [] - self.renderer = Renderer() + self.renderer = renderer.Renderer() # signals self.signals = [] - self.loss = Signal('Loss') + self.loss = utils.Signal('Loss') self.signals.append(self.loss) - self.curr_learning_rate = Signal('Learning Rate') + self.curr_learning_rate = utils.Signal('Learning Rate') self.signals.append(self.curr_learning_rate) if self.tp.env.normalize_observation and not self.env.is_state_type_image: if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers: - self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width,)) - self.running_reward_stats = RunningStat(()) + self.running_observation_stats = utils.RunningStat((self.tp.env.desired_observation_width,)) + self.running_reward_stats = utils.RunningStat(()) else: - self.running_observation_stats = SharedRunningStats(self.tp, replicated_device, - shape=(self.tp.env.desired_observation_width,), - name='observation_stats') - self.running_reward_stats = SharedRunningStats(self.tp, replicated_device, - shape=(), - name='reward_stats') + self.running_observation_stats = sv.SharedRunningStats(self.tp, replicated_device, + shape=(self.tp.env.desired_observation_width,), + name='observation_stats') + self.running_reward_stats = sv.SharedRunningStats(self.tp, replicated_device, + shape=(), + name='reward_stats') # env is already reset at this point. Otherwise we're getting an error where you cannot # reset an env which is not done @@ -137,13 +132,13 @@ class Agent(object): def log_to_screen(self, phase): # log to screen if self.current_episode >= 0: - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: exploration = self.exploration_policy.get_control_param() else: exploration = self.evaluation_exploration_policy.get_control_param() - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Worker", self.task_id), ("Episode", self.current_episode), ("total reward", self.total_reward_in_current_episode), @@ -154,37 +149,37 @@ class Agent(object): prefix=phase ) - def update_log(self, phase=RunPhase.TRAIN): + def update_log(self, phase=utils.RunPhase.TRAIN): """ Writes logging messages to screen and updates the log file with all the signal values. 
:return: None """ # log all the signals to file - logger.set_current_time(self.current_episode) - logger.create_signal_value('Training Iter', self.training_iteration) - logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP)) - logger.create_signal_value('ER #Transitions', self.memory.num_transitions()) - logger.create_signal_value('ER #Episodes', self.memory.length()) - logger.create_signal_value('Episode Length', self.current_episode_steps_counter) - logger.create_signal_value('Total steps', self.total_steps_counter) - logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param()) - logger.create_signal_value("Training Reward", self.total_reward_in_current_episode - if phase == RunPhase.TRAIN else np.nan) - logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode - if phase == RunPhase.TEST else np.nan) - logger.create_signal_value('Update Target Network', 0, overwrite=False) - logger.update_wall_clock_time(self.current_episode) + logger.logger.set_current_time(self.current_episode) + logger.logger.create_signal_value('Training Iter', self.training_iteration) + logger.logger.create_signal_value('In Heatup', int(phase == utils.RunPhase.HEATUP)) + logger.logger.create_signal_value('ER #Transitions', self.memory.num_transitions()) + logger.logger.create_signal_value('ER #Episodes', self.memory.length()) + logger.logger.create_signal_value('Episode Length', self.current_episode_steps_counter) + logger.logger.create_signal_value('Total steps', self.total_steps_counter) + logger.logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param()) + logger.logger.create_signal_value("Training Reward", self.total_reward_in_current_episode + if phase == utils.RunPhase.TRAIN else np.nan) + logger.logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode + if phase == utils.RunPhase.TEST else np.nan) + logger.logger.create_signal_value('Update Target Network', 0, overwrite=False) + logger.logger.update_wall_clock_time(self.current_episode) for signal in self.signals: - logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean()) - logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev()) - logger.create_signal_value("{}/Max".format(signal.name), signal.get_max()) - logger.create_signal_value("{}/Min".format(signal.name), signal.get_min()) + logger.logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean()) + logger.logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev()) + logger.logger.create_signal_value("{}/Max".format(signal.name), signal.get_max()) + logger.logger.create_signal_value("{}/Min".format(signal.name), signal.get_min()) # dump if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \ and self.current_episode > 0: - logger.dump_output_csv() + logger.logger.dump_output_csv() def reset_game(self, do_not_reset_env=False): """ @@ -211,7 +206,7 @@ class Agent(object): self.episode_running_info[action] = [] plt.clf() - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == configurations.MiddlewareTypes.LSTM: for network in self.networks: network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init @@ -281,9 +276,9 @@ class Agent(object): if self.total_steps_counter % 
self.tp.agent.num_steps_between_copying_online_weights_to_target == 0: for network in self.networks: network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target) - logger.create_signal_value('Update Target Network', 1) + logger.logger.create_signal_value('Update Target Network', 1) else: - logger.create_signal_value('Update Target Network', 0, overwrite=False) + logger.logger.create_signal_value('Update Target Network', 0, overwrite=False) return loss @@ -321,7 +316,7 @@ class Agent(object): plt.legend() plt.pause(0.00000001) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): """ choose an action to act with in the current episode being played. Different behavior might be exhibited when training or testing. @@ -351,15 +346,15 @@ class Agent(object): for input_name in self.tp.agent.input_types.keys(): input_state[input_name] = np.expand_dims(np.array(curr_state[input_name]), 0) return input_state - + def prepare_initial_state(self): """ Create an initial state when starting a new episode :return: None """ observation = self.preprocess_observation(self.env.state['observation']) - self.curr_stack = deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) - observation = LazyStack(self.curr_stack, -1) + self.curr_stack = collections.deque([observation]*self.tp.env.observation_stack_size, maxlen=self.tp.env.observation_stack_size) + observation = utils.LazyStack(self.curr_stack, -1) self.curr_state = { 'observation': observation @@ -369,21 +364,21 @@ class Agent(object): if self.tp.agent.use_accumulated_reward_as_measurement: self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0) - def act(self, phase=RunPhase.TRAIN): + def act(self, phase=utils.RunPhase.TRAIN): """ Take one step in the environment according to the network prediction and store the transition in memory :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored :return: A boolean value that signals an episode termination """ - if phase != RunPhase.TEST: + if phase != utils.RunPhase.TEST: self.total_steps_counter += 1 self.current_episode_steps_counter += 1 # get new action action_info = {"action_probability": 1.0 / self.env.action_space_size, "action_value": 0, "max_action_value": 0} - if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: + if phase == utils.RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: action = self.env.get_random_action() else: action, action_info = self.choose_action(self.curr_state, phase=phase) @@ -402,13 +397,13 @@ class Agent(object): next_state['observation'] = self.preprocess_observation(next_state['observation']) # plot action values online - if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP: + if self.tp.visualization.plot_action_values_online and phase != utils.RunPhase.HEATUP: self.plot_action_values_online() # initialize the next state # TODO: provide option to stack more than just the observation self.curr_stack.append(next_state['observation']) - observation = LazyStack(self.curr_stack, -1) + observation = utils.LazyStack(self.curr_stack, -1) next_state['observation'] = observation if self.tp.agent.use_measurements and 'measurements' in result.keys(): @@ -417,14 +412,14 @@ class Agent(object): next_state['measurements'] = np.append(next_state['measurements'], self.total_reward_in_current_episode) # store the 
transition only if we are training - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: - transition = Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done']) + if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP: + transition = memory.Transition(self.curr_state, result['action'], shaped_reward, next_state, result['done']) for key in action_info.keys(): transition.info[key] = action_info[key] if self.tp.agent.add_a_normalized_timestep_to_the_observation: transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit self.memory.store(transition) - elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs: + elif phase == utils.RunPhase.TEST and self.tp.visualization.dump_gifs: # we store the transitions only for saving gifs self.last_episode_images.append(self.env.get_rendered_image()) @@ -437,7 +432,7 @@ class Agent(object): self.update_log(phase=phase) self.log_to_screen(phase=phase) - if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP: + if phase == utils.RunPhase.TRAIN or phase == utils.RunPhase.HEATUP: self.reset_game() self.current_episode += 1 @@ -456,8 +451,8 @@ class Agent(object): max_reward_achieved = -float('inf') average_evaluation_reward = 0 - screen.log_title("Running evaluation") - self.env.change_phase(RunPhase.TEST) + logger.screen.log_title("Running evaluation") + self.env.change_phase(utils.RunPhase.TEST) for i in range(num_episodes): # keep the online network in sync with the global network if keep_networks_synced: @@ -466,7 +461,7 @@ class Agent(object): episode_ended = False while not episode_ended: - episode_ended = self.act(phase=RunPhase.TEST) + episode_ended = self.act(phase=utils.RunPhase.TEST) if keep_networks_synced \ and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps: @@ -477,7 +472,7 @@ class Agent(object): max_reward_achieved = self.total_reward_in_current_episode frame_skipping = int(5/self.tp.env.frame_skip) if self.tp.visualization.dump_gifs: - logger.create_gif(self.last_episode_images[::frame_skipping], + logger.logger.create_gif(self.last_episode_images[::frame_skipping], name='score-{}'.format(max_reward_achieved), fps=10) average_evaluation_reward += self.total_reward_in_current_episode @@ -485,8 +480,8 @@ class Agent(object): average_evaluation_reward /= float(num_episodes) - self.env.change_phase(RunPhase.TRAIN) - screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward)) + self.env.change_phase(utils.RunPhase.TRAIN) + logger.screen.log_title("Evaluation done. 
Average reward = {}.".format(average_evaluation_reward)) def post_training_commands(self): pass @@ -505,15 +500,15 @@ class Agent(object): # heatup phase if self.tp.num_heatup_steps != 0: self.in_heatup = True - screen.log_title("Starting heatup {}".format(self.task_id)) + logger.screen.log_title("Starting heatup {}".format(self.task_id)) num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size for step in range(max(self.tp.num_heatup_steps, num_steps_required_for_one_training_batch)): - self.act(phase=RunPhase.HEATUP) + self.act(phase=utils.RunPhase.HEATUP) # training phase self.in_heatup = False - screen.log_title("Starting training {}".format(self.task_id)) - self.exploration_policy.change_phase(RunPhase.TRAIN) + logger.screen.log_title("Starting training {}".format(self.task_id)) + self.exploration_policy.change_phase(utils.RunPhase.TRAIN) training_start_time = time.time() model_snapshots_periods_passed = -1 self.reset_game() @@ -557,7 +552,7 @@ class Agent(object): self.loss.add_sample(loss) self.training_iteration += 1 if self.imitation: - self.log_to_screen(RunPhase.TRAIN) + self.log_to_screen(utils.RunPhase.TRAIN) self.post_training_commands() def save_model(self, model_id): diff --git a/agents/bc_agent.py b/agents/bc_agent.py index 70fe3e6..af01720 100644 --- a/agents/bc_agent.py +++ b/agents/bc_agent.py @@ -13,16 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import numpy as np -from agents.imitation_agent import ImitationAgent +from agents import imitation_agent # Behavioral Cloning Agent -class BCAgent(ImitationAgent): +class BCAgent(imitation_agent.ImitationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + imitation_agent.ImitationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def learn_from_batch(self, batch): current_states, _, actions, _, _, _ = self.extract_batch(batch) diff --git a/agents/bootstrapped_dqn_agent.py b/agents/bootstrapped_dqn_agent.py index 3476022..41aea9f 100644 --- a/agents/bootstrapped_dqn_agent.py +++ b/agents/bootstrapped_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,17 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import numpy as np -from agents.value_optimization_agent import * - +from agents import value_optimization_agent as voa +import utils # Bootstrapped DQN - https://arxiv.org/pdf/1602.04621.pdf -class BootstrappedDQNAgent(ValueOptimizationAgent): +class BootstrappedDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def reset_game(self, do_not_reset_env=False): - ValueOptimizationAgent.reset_game(self, do_not_reset_env) + voa.ValueOptimizationAgent.reset_game(self, do_not_reset_env) self.exploration_policy.select_head() def learn_from_batch(self, batch): @@ -51,8 +52,8 @@ class BootstrappedDQNAgent(ValueOptimizationAgent): return total_loss - def act(self, phase=RunPhase.TRAIN): - ValueOptimizationAgent.act(self, phase) + def act(self, phase=utils.RunPhase.TRAIN): + voa.ValueOptimizationAgent.act(self, phase) mask = np.random.binomial(1, self.tp.exploration.bootstrapped_data_sharing_probability, self.tp.exploration.architecture_num_q_heads) self.memory.update_last_transition_info({'mask': mask}) diff --git a/agents/categorical_dqn_agent.py b/agents/categorical_dqn_agent.py index dec8ba2..8e442fc 100644 --- a/agents/categorical_dqn_agent.py +++ b/agents/categorical_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf -class CategoricalDQNAgent(ValueOptimizationAgent): +class CategoricalDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms) # prediction's format is (batch,actions,atoms) @@ -57,4 +58,3 @@ class CategoricalDQNAgent(ValueOptimizationAgent): total_loss = result[0] return total_loss - diff --git a/agents/clipped_ppo_agent.py b/agents/clipped_ppo_agent.py index 88a70b0..07343c8 100644 --- a/agents/clipped_ppo_agent.py +++ b/agents/clipped_ppo_agent.py @@ -13,27 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from agents.actor_critic_agent import * +import collections +import copy from random import shuffle +import numpy as np + +from agents import actor_critic_agent as aca +from agents import policy_optimization_agent as poa +import logger +import utils + # Clipped Proximal Policy Optimization - https://arxiv.org/abs/1707.06347 -class ClippedPPOAgent(ActorCriticAgent): +class ClippedPPOAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) # signals definition - self.value_loss = Signal('Value Loss') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') + self.policy_loss = utils.Signal('Policy Loss') self.signals.append(self.policy_loss) self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') + self.unclipped_grads = utils.Signal('Grads (unclipped)') self.signals.append(self.unclipped_grads) - self.value_targets = Signal('Value Targets') + self.value_targets = utils.Signal('Value Targets') self.signals.append(self.value_targets) - self.kl_divergence = Signal('KL Divergence') + self.kl_divergence = utils.Signal('KL Divergence') self.signals.append(self.kl_divergence) def fill_advantages(self, batch): @@ -46,9 +53,9 @@ class ClippedPPOAgent(ActorCriticAgent): # calculate advantages advantages = [] value_targets = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps episode_start_idx = 0 advantages = np.array([]) @@ -66,7 +73,7 @@ class ClippedPPOAgent(ActorCriticAgent): advantages = np.append(advantages, rollout_advantages) value_targets = np.append(value_targets, gae_based_value_targets) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") # standardize advantages = (advantages - np.mean(advantages)) / np.std(advantages) @@ -144,8 +151,8 @@ class ClippedPPOAgent(ActorCriticAgent): curr_learning_rate = self.tp.learning_rate # log training parameters - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Surrogate loss", loss['policy_losses'][0]), ("KL divergence", loss['fetch_result'][0]), ("Entropy", loss['fetch_result'][1]), @@ -184,13 +191,13 @@ class ClippedPPOAgent(ActorCriticAgent): self.update_log() # should be done in order to update the data that has been accumulated * while not playing * return np.append(losses[0], losses[1]) - def choose_action(self, current_state, phase=RunPhase.TRAIN): + def choose_action(self, current_state, phase=utils.RunPhase.TRAIN): if self.env.discrete_controls: # DISCRETE _, action_values = self.main_network.online_network.predict(self.tf_input_state(current_state)) action_values = action_values.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -201,7 +208,7 
@@ class ClippedPPOAgent(ActorCriticAgent): _, action_values_mean, action_values_std = self.main_network.online_network.predict(self.tf_input_state(current_state)) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) # if self.current_episode % 5 == 0 and self.current_episode_steps_counter < 5: # print action diff --git a/agents/ddpg_agent.py b/agents/ddpg_agent.py index 425f1de..df05395 100644 --- a/agents/ddpg_agent.py +++ b/agents/ddpg_agent.py @@ -13,28 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import copy -from agents.actor_critic_agent import * -from configurations import * +import numpy as np + +from agents import actor_critic_agent as aca +from agents import agent +from architectures import network_wrapper as nw +import configurations as conf +import utils # Deep Deterministic Policy Gradients Network - https://arxiv.org/pdf/1509.02971.pdf -class DDPGAgent(ActorCriticAgent): +class DDPGAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) # define critic network self.critic_network = self.main_network # self.networks.append(self.critic_network) # define actor network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.Pi] - self.actor_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'actor', - self.replicated_device, self.worker_device) + tuning_parameters.agent.input_types = {'observation': conf.InputTypes.Observation} + tuning_parameters.agent.output_types = [conf.OutputTypes.Pi] + self.actor_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'actor', + self.replicated_device, self.worker_device) self.networks.append(self.actor_network) - self.q_values = Signal("Q") + self.q_values = utils.Signal("Q") self.signals.append(self.q_values) self.reset_game(do_not_reset_env=True) @@ -82,14 +88,14 @@ class DDPGAgent(ActorCriticAgent): return total_loss def train(self): - return Agent.train(self) + return agent.Agent.train(self) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): assert not self.env.discrete_controls, 'DDPG works only for continuous control problems' result = self.actor_network.online_network.predict(self.tf_input_state(curr_state)) action_values = result[0].squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/ddqn_agent.py b/agents/ddqn_agent.py index 838ae3f..9f19e8a 100644 --- a/agents/ddqn_agent.py +++ b/agents/ddqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Double DQN - https://arxiv.org/abs/1509.06461 -class DDQNAgent(ValueOptimizationAgent): +class DDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) def learn_from_batch(self, batch): current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch) diff --git a/agents/dfp_agent.py b/agents/dfp_agent.py index c055d2c..f778fff 100644 --- a/agents/dfp_agent.py +++ b/agents/dfp_agent.py @@ -13,17 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.agent import * +from agents import agent +from architectures import network_wrapper as nw +import utils # Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf -class DFPAgent(Agent): +class DFPAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.current_goal = self.tp.agent.goal_vector - self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main', - self.replicated_device, self.worker_device) + self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) def learn_from_batch(self, batch): @@ -45,7 +48,7 @@ class DFPAgent(Agent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # convert to batch so we can run it through the network observation = np.expand_dims(np.array(curr_state['observation']), 0) measurements = np.expand_dims(np.array(curr_state['measurements']), 0) @@ -66,7 +69,7 @@ class DFPAgent(Agent): self.tp.agent.future_measurements_weights) # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) diff --git a/agents/distributional_dqn_agent.py b/agents/distributional_dqn_agent.py index d7c0088..33e7a5e 100644 --- a/agents/distributional_dqn_agent.py +++ b/agents/distributional_dqn_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 #
 
+import numpy as np
-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa
 
 
 # Distributional Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
-class DistributionalDQNAgent(ValueOptimizationAgent):
+class DistributionalDQNAgent(voa.ValueOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
         self.z_values = np.linspace(self.tp.agent.v_min, self.tp.agent.v_max, self.tp.agent.atoms)
 
     # prediction's format is (batch,actions,atoms)
@@ -57,4 +58,3 @@ class DistributionalDQNAgent(ValueOptimizationAgent):
         total_loss = result[0]
 
         return total_loss
-
diff --git a/agents/dqn_agent.py b/agents/dqn_agent.py
index 70c0c7d..8660def 100644
--- a/agents/dqn_agent.py
+++ b/agents/dqn_agent.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2017 Intel Corporation
+# Copyright (c) 2017 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,14 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 
+import numpy as np
-from agents.value_optimization_agent import *
+from agents import value_optimization_agent as voa
 
 
 # Deep Q Network - https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
-class DQNAgent(ValueOptimizationAgent):
+class DQNAgent(voa.ValueOptimizationAgent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
 
     def learn_from_batch(self, batch):
         current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)
diff --git a/agents/human_agent.py b/agents/human_agent.py
index c75c2a2..7f8e491 100644
--- a/agents/human_agent.py
+++ b/agents/human_agent.py
@@ -13,31 +13,37 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import collections
+import os
 
-from agents.agent import *
 import pygame
+from pandas.io import pickle
+
+from agents import agent
+import logger
+import utils
 
 
-class HumanAgent(Agent):
+class HumanAgent(agent.Agent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
         self.clock = pygame.time.Clock()
         self.max_fps = int(self.tp.visualization.max_fps_for_human_control)
-        screen.log_title("Human Control Mode")
+        logger.screen.log_title("Human Control Mode")
         available_keys = self.env.get_available_keys()
         if available_keys:
-            screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
-            screen.log("")
+            logger.screen.log("Use keyboard keys to move. Press escape to quit. Available keys:")
+            logger.screen.log("")
             for action, key in self.env.get_available_keys():
-                screen.log("\t- {}: {}".format(action, key))
-                screen.separator()
+                logger.screen.log("\t- {}: {}".format(action, key))
+                logger.screen.separator()
 
     def train(self):
         return 0
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         action = self.env.get_action_from_user()
 
         # keep constant fps
@@ -49,16 +55,16 @@ class HumanAgent(Agent):
         return action, {"action_value": 0}
 
     def save_replay_buffer_and_exit(self):
-        replay_buffer_path = os.path.join(logger.experiments_path, 'replay_buffer.p')
+        replay_buffer_path = os.path.join(logger.logger.experiments_path, 'replay_buffer.p')
         self.memory.tp = None
-        to_pickle(self.memory, replay_buffer_path)
-        screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
+        pickle.to_pickle(self.memory, replay_buffer_path)
+        logger.screen.log_title("Replay buffer was stored in {}".format(replay_buffer_path))
         exit()
 
     def log_to_screen(self, phase):
-        # log to screen
-        screen.log_dict(
-            OrderedDict([
+        # log to screen
+        logger.screen.log_dict(
+            collections.OrderedDict([
                 ("Episode", self.current_episode),
                 ("total reward", self.total_reward_in_current_episode),
                 ("steps", self.total_steps_counter)
diff --git a/agents/imitation_agent.py b/agents/imitation_agent.py
index f893fbe..522c569 100644
--- a/agents/imitation_agent.py
+++ b/agents/imitation_agent.py
@@ -13,23 +13,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import collections
 
-from agents.agent import *
+from agents import agent
+from architectures import network_wrapper as nw
+import utils
+import logger
 
 
 # Imitation Agent
-class ImitationAgent(Agent):
+class ImitationAgent(agent.Agent):
     def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
-        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
-        self.main_network = NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
-                                           self.replicated_device, self.worker_device)
+        agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
+        self.main_network = nw.NetworkWrapper(tuning_parameters, False, self.has_global, 'main',
+                                              self.replicated_device, self.worker_device)
         self.networks.append(self.main_network)
         self.imitation = True
 
     def extract_action_values(self, prediction):
         return prediction.squeeze()
 
-    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
+    def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN):
         # convert to batch so we can run it through the network
         prediction = self.main_network.online_network.predict(self.tf_input_state(curr_state))
@@ -49,10 +53,10 @@ class ImitationAgent(Agent):
 
     def log_to_screen(self, phase):
         # log to screen
-        if phase == RunPhase.TRAIN:
+        if phase == utils.RunPhase.TRAIN:
             # for the training phase - we log during the episode to visualize the progress in training
-            screen.log_dict(
-                OrderedDict([
+            logger.screen.log_dict(
+                collections.OrderedDict([
                     ("Worker", self.task_id),
                     ("Episode", self.current_episode),
                     ("Loss", self.loss.values[-1]),
@@ -62,4 +66,4 @@
             )
         else:
             # for the evaluation phase - logging as in regular RL
-            Agent.log_to_screen(self, phase)
+            agent.Agent.log_to_screen(self, phase)
diff --git a/agents/mmc_agent.py b/agents/mmc_agent.py
index 2b5a2cb..4473b06 100644
--- a/agents/mmc_agent.py
+++ b/agents/mmc_agent.py
@@ -1,5 +1,5 @@
 #
-# Copyright 
(c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa -class MixedMonteCarloAgent(ValueOptimizationAgent): +class MixedMonteCarloAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate def learn_from_batch(self, batch): diff --git a/agents/n_step_q_agent.py b/agents/n_step_q_agent.py index 5a74fb5..a5b773c 100644 --- a/agents/n_step_q_agent.py +++ b/agents/n_step_q_agent.py @@ -14,22 +14,21 @@ # limitations under the License. # import numpy as np -import scipy.signal -from agents.value_optimization_agent import ValueOptimizationAgent -from agents.policy_optimization_agent import PolicyOptimizationAgent -from logger import logger -from utils import Signal, last_sample +from agents import value_optimization_agent as voa +from agents import policy_optimization_agent as poa +import logger +import utils # N Step Q Learning Agent - https://arxiv.org/abs/1602.01783 -class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): +class NStepQAgent(voa.ValueOptimizationAgent, poa.PolicyOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, create_target_network=True) self.last_gradient_update_step_idx = 0 - self.q_values = Signal('Q Values') - self.unclipped_grads = Signal('Grads (unclipped)') - self.value_loss = Signal('Value Loss') + self.q_values = utils.Signal('Q Values') + self.unclipped_grads = utils.Signal('Grads (unclipped)') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.q_values) self.signals.append(self.unclipped_grads) self.signals.append(self.value_loss) @@ -57,7 +56,7 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): if game_overs[-1]: R = 0 else: - R = np.max(self.main_network.target_network.predict(last_sample(next_states))) + R = np.max(self.main_network.target_network.predict(utils.last_sample(next_states))) for i in reversed(range(num_transitions)): R = rewards[i] + self.tp.agent.discount * R @@ -85,4 +84,4 @@ class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent): else: logger.create_signal_value('Update Target Network', 0, overwrite=False) - return PolicyOptimizationAgent.train(self) + return poa.PolicyOptimizationAgent.train(self) diff --git a/agents/naf_agent.py b/agents/naf_agent.py index 65ca83c..35072f7 100644 --- a/agents/naf_agent.py +++ b/agents/naf_agent.py @@ -13,21 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - import numpy as np from agents.value_optimization_agent import ValueOptimizationAgent -from utils import RunPhase, Signal +import utils # Normalized Advantage Functions - https://arxiv.org/pdf/1603.00748.pdf class NAFAgent(ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.l_values = Signal("L") - self.a_values = Signal("Advantage") - self.mu_values = Signal("Action") - self.v_values = Signal("V") + self.l_values = utils.Signal("L") + self.a_values = utils.Signal("Advantage") + self.mu_values = utils.Signal("Action") + self.v_values = utils.Signal("V") self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values] def learn_from_batch(self, batch): @@ -49,7 +48,7 @@ class NAFAgent(ValueOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): assert not self.env.discrete_controls, 'NAF works only for continuous control problems' # convert to batch so we can run it through the network @@ -60,7 +59,7 @@ class NAFAgent(ValueOptimizationAgent): outputs=naf_head.mu, squeeze_output=False, ) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/nec_agent.py b/agents/nec_agent.py index 77520a4..47aa33f 100644 --- a/agents/nec_agent.py +++ b/agents/nec_agent.py @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import numpy as np - -from agents.value_optimization_agent import ValueOptimizationAgent +from agents import value_optimization_agent as voa from logger import screen -from utils import RunPhase +import utils # Neural Episodic Control - https://arxiv.org/pdf/1703.01988.pdf -class NECAgent(ValueOptimizationAgent): +class NECAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=False) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=False) self.current_episode_state_embeddings = [] self.training_started = False @@ -52,7 +49,7 @@ class NECAgent(ValueOptimizationAgent): return total_loss - def act(self, phase=RunPhase.TRAIN): + def act(self, phase=utils.RunPhase.TRAIN): if self.in_heatup: # get embedding in heatup (otherwise we get it through choose_action) embedding = self.main_network.online_network.predict( diff --git a/agents/pal_agent.py b/agents/pal_agent.py index 68ff675..9a11e00 100644 --- a/agents/pal_agent.py +++ b/agents/pal_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Persistent Advantage Learning - https://arxiv.org/pdf/1512.04860.pdf -class PALAgent(ValueOptimizationAgent): +class PALAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.alpha = tuning_parameters.agent.pal_alpha self.persistent = tuning_parameters.agent.persistent_advantage_learning self.monte_carlo_mixing_rate = tuning_parameters.agent.monte_carlo_mixing_rate diff --git a/agents/policy_gradients_agent.py b/agents/policy_gradients_agent.py index 3a592d1..037dc88 100644 --- a/agents/policy_gradients_agent.py +++ b/agents/policy_gradients_agent.py @@ -13,25 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -from agents.policy_optimization_agent import * import numpy as np -from logger import * -import tensorflow as tf -try: - import matplotlib.pyplot as plt -except: - from logger import failed_imports - failed_imports.append("matplotlib") -from utils import * +from agents import policy_optimization_agent as poa +import logger +import utils -class PolicyGradientsAgent(PolicyOptimizationAgent): +class PolicyGradientsAgent(poa.PolicyOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.returns_mean = Signal('Returns Mean') - self.returns_variance = Signal('Returns Variance') + poa.PolicyOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.returns_mean = utils.Signal('Returns Mean') + self.returns_variance = utils.Signal('Returns Variance') self.signals.append(self.returns_mean) self.signals.append(self.returns_variance) self.last_gradient_update_step_idx = 0 @@ -41,21 +34,21 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): current_states, next_states, actions, rewards, game_overs, total_returns = self.extract_batch(batch) for i in reversed(range(len(total_returns))): - if self.policy_gradient_rescaler == PolicyGradientRescaler.TOTAL_RETURN: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.TOTAL_RETURN: total_returns[i] = total_returns[0] - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN: # just take the total return as it is pass - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_EPISODE: # we can get a single transition episode while playing Doom Basic, causing the std to be 0 if self.std_discounted_return != 0: total_returns[i] = (total_returns[i] - self.mean_discounted_return) / self.std_discounted_return else: total_returns[i] = 0 - elif self.policy_gradient_rescaler == PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.FUTURE_RETURN_NORMALIZED_BY_TIMESTEP: total_returns[i] -= self.mean_return_over_multiple_episodes[i] else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + 
logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") targets = total_returns if not self.env.discrete_controls and len(actions.shape) < 2: @@ -69,12 +62,12 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): return total_loss - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): # convert to batch so we can run it through the network if self.env.discrete_controls: # DISCRETE action_values = self.main_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -84,7 +77,7 @@ class PolicyGradientsAgent(PolicyOptimizationAgent): # CONTINUOUS result = self.main_network.online_network.predict(self.tf_input_state(curr_state)) action_values = result[0].squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = action_values diff --git a/agents/policy_optimization_agent.py b/agents/policy_optimization_agent.py index be23760..175ca7f 100644 --- a/agents/policy_optimization_agent.py +++ b/agents/policy_optimization_agent.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections -from agents.agent import * -from memories.memory import Episode +import numpy as np + +from agents import agent +from architectures import network_wrapper as nw +import logger +import utils -class PolicyGradientRescaler(Enum): +class PolicyGradientRescaler(utils.Enum): TOTAL_RETURN = 0 FUTURE_RETURN = 1 FUTURE_RETURN_NORMALIZED_BY_EPISODE = 2 @@ -30,11 +35,11 @@ class PolicyGradientRescaler(Enum): GAE = 8 -class PolicyOptimizationAgent(Agent): +class PolicyOptimizationAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=False): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) self.policy_gradient_rescaler = PolicyGradientRescaler().get(self.tp.agent.policy_gradient_rescaler) @@ -44,7 +49,7 @@ class PolicyOptimizationAgent(Agent): self.max_episode_length = 100000 self.mean_return_over_multiple_episodes = np.zeros(self.max_episode_length) self.num_episodes_where_step_has_been_seen = np.zeros(self.max_episode_length) - self.entropy = Signal('Entropy') + self.entropy = utils.Signal('Entropy') self.signals.append(self.entropy) self.reset_game(do_not_reset_env=True) @@ -52,8 +57,8 @@ class PolicyOptimizationAgent(Agent): def log_to_screen(self, phase): # log to screen if self.current_episode > 0: - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Worker", self.task_id), ("Episode", 
self.current_episode), ("total reward", self.total_reward_in_current_episode), diff --git a/agents/ppo_agent.py b/agents/ppo_agent.py index 4a37e69..35d1b98 100644 --- a/agents/ppo_agent.py +++ b/agents/ppo_agent.py @@ -13,36 +13,44 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import collections +import copy -from agents.actor_critic_agent import * -from random import shuffle +import numpy as np + +from agents import actor_critic_agent as aca +from agents import policy_optimization_agent as poa +from architectures import network_wrapper as nw +import configurations +import logger +import utils # Proximal Policy Optimization - https://arxiv.org/pdf/1707.06347.pdf -class PPOAgent(ActorCriticAgent): +class PPOAgent(aca.ActorCriticAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, - create_target_network=True) + aca.ActorCriticAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id, + create_target_network=True) self.critic_network = self.main_network # define the policy network - tuning_parameters.agent.input_types = {'observation': InputTypes.Observation} - tuning_parameters.agent.output_types = [OutputTypes.PPO] + tuning_parameters.agent.input_types = {'observation': configurations.InputTypes.Observation} + tuning_parameters.agent.output_types = [configurations.OutputTypes.PPO] tuning_parameters.agent.optimizer_type = 'Adam' tuning_parameters.agent.l2_regularization = 0 - self.policy_network = NetworkWrapper(tuning_parameters, True, self.has_global, 'policy', - self.replicated_device, self.worker_device) + self.policy_network = nw.NetworkWrapper(tuning_parameters, True, self.has_global, 'policy', + self.replicated_device, self.worker_device) self.networks.append(self.policy_network) # signals definition - self.value_loss = Signal('Value Loss') + self.value_loss = utils.Signal('Value Loss') self.signals.append(self.value_loss) - self.policy_loss = Signal('Policy Loss') + self.policy_loss = utils.Signal('Policy Loss') self.signals.append(self.policy_loss) - self.kl_divergence = Signal('KL Divergence') + self.kl_divergence = utils.Signal('KL Divergence') self.signals.append(self.kl_divergence) self.total_kl_divergence_during_training_process = 0.0 - self.unclipped_grads = Signal('Grads (unclipped)') + self.unclipped_grads = utils.Signal('Grads (unclipped)') self.signals.append(self.unclipped_grads) self.reset_game(do_not_reset_env=True) @@ -57,9 +65,9 @@ class PPOAgent(ActorCriticAgent): # calculate advantages advantages = [] - if self.policy_gradient_rescaler == PolicyGradientRescaler.A_VALUE: + if self.policy_gradient_rescaler == poa.PolicyGradientRescaler.A_VALUE: advantages = total_return - current_state_values - elif self.policy_gradient_rescaler == PolicyGradientRescaler.GAE: + elif self.policy_gradient_rescaler == poa.PolicyGradientRescaler.GAE: # get bootstraps episode_start_idx = 0 advantages = np.array([]) @@ -76,7 +84,7 @@ class PPOAgent(ActorCriticAgent): episode_start_idx = idx + 1 advantages = np.append(advantages, rollout_advantages) else: - screen.warning("WARNING: The requested policy gradient rescaler is not available") + logger.screen.warning("WARNING: The requested policy gradient rescaler is not available") # standardize advantages = (advantages - np.mean(advantages)) / np.std(advantages) @@ -107,7 +115,7 @@ class PPOAgent(ActorCriticAgent): for k, v in 
current_states.items() } total_return_batch = total_return[i * batch_size:(i + 1) * batch_size] - old_policy_values = force_list(self.critic_network.target_network.predict( + old_policy_values = utils.force_list(self.critic_network.target_network.predict( current_states_batch).squeeze()) if self.critic_network.online_network.optimizer_type != 'LBFGS': targets = total_return_batch @@ -155,7 +163,7 @@ class PPOAgent(ActorCriticAgent): actions = np.expand_dims(actions, -1) # get old policy probabilities and distribution - old_policy = force_list(self.policy_network.target_network.predict(current_states)) + old_policy = utils.force_list(self.policy_network.target_network.predict(current_states)) # calculate gradients and apply on both the local policy network and on the global policy network fetches = [self.policy_network.online_network.output_heads[0].kl_divergence, @@ -196,8 +204,8 @@ class PPOAgent(ActorCriticAgent): curr_learning_rate = self.tp.learning_rate # log training parameters - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Surrogate loss", loss['policy_losses'][0]), ("KL divergence", loss['fetch_result'][0]), ("Entropy", loss['fetch_result'][1]), @@ -215,7 +223,7 @@ class PPOAgent(ActorCriticAgent): def update_kl_coefficient(self): # John Schulman takes the mean kl divergence only over the last epoch which is strange but we will follow # his implementation for now because we know it works well - screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process)) + logger.screen.log_title("KL = {}".format(self.total_kl_divergence_during_training_process)) # update kl coefficient kl_target = self.tp.agent.target_kl_divergence @@ -236,7 +244,7 @@ class PPOAgent(ActorCriticAgent): new_kl_coefficient, self.policy_network.online_network.output_heads[0].kl_coefficient_ph) - screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient)) + logger.screen.log_title("KL penalty coefficient change = {} -> {}".format(kl_coefficient, new_kl_coefficient)) def post_training_commands(self): if self.tp.agent.use_kl_regularization: @@ -264,12 +272,12 @@ class PPOAgent(ActorCriticAgent): self.update_log() # should be done in order to update the data that has been accumulated * while not playing * return np.append(value_loss, policy_loss) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): if self.env.discrete_controls: # DISCRETE action_values = self.policy_network.online_network.predict(self.tf_input_state(curr_state)).squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = self.exploration_policy.get_action(action_values) else: action = np.argmax(action_values) @@ -280,7 +288,7 @@ class PPOAgent(ActorCriticAgent): action_values_mean, action_values_std = self.policy_network.online_network.predict(self.tf_input_state(curr_state)) action_values_mean = action_values_mean.squeeze() action_values_std = action_values_std.squeeze() - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: action = np.squeeze(np.random.randn(1, self.action_space_size) * action_values_std + action_values_mean) else: action = action_values_mean diff --git a/agents/qr_dqn_agent.py b/agents/qr_dqn_agent.py index 8888d18..c36b861 100644 --- a/agents/qr_dqn_agent.py +++ b/agents/qr_dqn_agent.py @@ -13,14 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
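
Note: the continuous-action branch of choose_action in the PPO hunk above samples an action by scaling standard-normal noise with the predicted std and shifting by the predicted mean. A minimal standalone sketch of that sampling step follows; the numeric values are invented for illustration, and in the real code the mean/std come from the policy network's prediction for the current state.

import numpy as np

# Illustrative values only; the diff above obtains these from the policy head.
action_mean = np.array([0.2, -0.5])
action_std = np.array([0.1, 0.3])

# Same rule as the continuous branch of choose_action: scale standard-normal
# noise by the predicted std and shift by the mean, i.e. draw from an
# independent Gaussian per action dimension.
action = np.random.randn(*action_mean.shape) * action_std + action_mean
print(action)
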
# +import numpy as np -from agents.value_optimization_agent import * +from agents import value_optimization_agent as voa # Quantile Regression Deep Q Network - https://arxiv.org/pdf/1710.10044v1.pdf -class QuantileRegressionDQNAgent(ValueOptimizationAgent): +class QuantileRegressionDQNAgent(voa.ValueOptimizationAgent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0): - ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + voa.ValueOptimizationAgent.__init__(self, env, tuning_parameters, replicated_device, thread_id) self.quantile_probabilities = np.ones(self.tp.agent.atoms) / float(self.tp.agent.atoms) # prediction's format is (batch,actions,atoms) diff --git a/agents/value_optimization_agent.py b/agents/value_optimization_agent.py index 75708d7..d91a98e 100644 --- a/agents/value_optimization_agent.py +++ b/agents/value_optimization_agent.py @@ -13,21 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import numpy as np -from agents.agent import Agent -from architectures.network_wrapper import NetworkWrapper -from utils import RunPhase, Signal +from agents import agent +from architectures import network_wrapper as nw +import utils -class ValueOptimizationAgent(Agent): +class ValueOptimizationAgent(agent.Agent): def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0, create_target_network=True): - Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) - self.main_network = NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', - self.replicated_device, self.worker_device) + agent.Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id) + self.main_network = nw.NetworkWrapper(tuning_parameters, create_target_network, self.has_global, 'main', + self.replicated_device, self.worker_device) self.networks.append(self.main_network) - self.q_values = Signal("Q") + self.q_values = utils.Signal("Q") self.signals.append(self.q_values) self.reset_game(do_not_reset_env=True) @@ -47,12 +46,12 @@ class ValueOptimizationAgent(Agent): 'require exploration policies which return a single action.' ).format(policy.__class__.__name__)) - def choose_action(self, curr_state, phase=RunPhase.TRAIN): + def choose_action(self, curr_state, phase=utils.RunPhase.TRAIN): prediction = self.get_prediction(curr_state) actions_q_values = self.get_q_values(prediction) # choose action according to the exploration policy and the current phase (evaluating or training the agent) - if phase == RunPhase.TRAIN: + if phase == utils.RunPhase.TRAIN: exploration_policy = self.exploration_policy else: exploration_policy = self.evaluation_exploration_policy diff --git a/architectures/__init__.py b/architectures/__init__.py index cbf2ac5..e72fb00 100644 --- a/architectures/__init__.py +++ b/architectures/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
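
Note: ValueOptimizationAgent.choose_action above switches between a training-time exploration policy and an evaluation-time policy based on utils.RunPhase. The sketch below illustrates that phase switch with a made-up epsilon-greedy policy; the class name and parameters are assumptions for illustration only, not Coach's actual exploration API.

import numpy as np

class EpsilonGreedy:
    """Toy stand-in for an exploration policy (not Coach's real class)."""
    def __init__(self, epsilon):
        self.epsilon = epsilon

    def get_action(self, q_values):
        # With probability epsilon pick a random action, otherwise act greedily.
        if np.random.rand() < self.epsilon:
            return np.random.randint(len(q_values))
        return int(np.argmax(q_values))

train_policy = EpsilonGreedy(epsilon=0.1)  # explores occasionally during training
eval_policy = EpsilonGreedy(epsilon=0.0)   # fully greedy at evaluation time

q_values = np.array([0.1, 0.7, 0.3])
phase = "TRAIN"  # stands in for utils.RunPhase.TRAIN vs. evaluation
policy = train_policy if phase == "TRAIN" else eval_policy
print(policy.get_action(q_values))
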
# - -from architectures.architecture import * -from logger import failed_imports -try: - from architectures.tensorflow_components.general_network import * - from architectures.tensorflow_components.architecture import * -except ImportError: - failed_imports.append("TensorFlow") +import logger try: - from architectures.neon_components.general_network import * - from architectures.neon_components.architecture import * + from architectures.tensorflow_components import general_network as ts_gn + from architectures.tensorflow_components import architecture as ts_arch except ImportError: - failed_imports.append("Neon") + logger.failed_imports.append("TensorFlow") -from architectures.network_wrapper import * \ No newline at end of file +try: + from architectures.neon_components import general_network as neon_gn + from architectures.neon_components import architecture as neon_arch +except ImportError: + logger.failed_imports.append("Neon") diff --git a/architectures/architecture.py b/architectures/architecture.py index d3175b7..03c48d8 100644 --- a/architectures/architecture.py +++ b/architectures/architecture.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,8 +14,6 @@ # limitations under the License. # -from configurations import Preset - class Architecture(object): def __init__(self, tuning_parameters, name=""): @@ -73,4 +71,4 @@ class Architecture(object): pass def set_variable_value(self, assign_op, value, placeholder=None): - pass \ No newline at end of file + pass diff --git a/architectures/neon_components/architecture.py b/architectures/neon_components/architecture.py index de600c1..1577ed8 100644 --- a/architectures/neon_components/architecture.py +++ b/architectures/neon_components/architecture.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,19 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import sys -import copy -from ngraph.frontends.neon import * import ngraph as ng -from architectures.architecture import * import numpy as np -from utils import * + +from architectures import architecture +import utils -class NeonArchitecture(Architecture): +class NeonArchitecture(architecture.Architecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): - Architecture.__init__(self, tuning_parameters, name) + architecture.Architecture.__init__(self, tuning_parameters, name) assert tuning_parameters.agent.neon_support, 'Neon is not supported for this agent' self.clip_error = tuning_parameters.clip_gradients self.total_loss = None @@ -113,8 +110,8 @@ class NeonArchitecture(Architecture): def accumulate_gradients(self, inputs, targets): # Neon doesn't currently allow separating the grads calculation and grad apply operations # so this feature is not currently available. 
instead we do a full training iteration - inputs = force_list(inputs) - targets = force_list(targets) + inputs = utils.force_list(inputs) + targets = utils.force_list(targets) for idx, input in enumerate(inputs): inputs[idx] = input.swapaxes(0, -1) diff --git a/architectures/neon_components/embedders.py b/architectures/neon_components/embedders.py index 5f594a3..9d20a9d 100644 --- a/architectures/neon_components/embedders.py +++ b/architectures/neon_components/embedders.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import ngraph.frontends.neon as neon import ngraph as ng -from ngraph.util.names import name_scope +import ngraph.frontends.neon as neon +import ngraph.util.names as ngraph_names class InputEmbedder(object): @@ -31,7 +30,7 @@ class InputEmbedder(object): self.output = None def __call__(self, prev_input_placeholder=None): - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): # create the input axes axes = [] if len(self.input_size) == 2: diff --git a/architectures/neon_components/general_network.py b/architectures/neon_components/general_network.py index 99ac6e9..f837f71 100644 --- a/architectures/neon_components/general_network.py +++ b/architectures/neon_components/general_network.py @@ -13,15 +13,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import ngraph as ng +from ngraph.frontends import neon +from ngraph.util import names as ngraph_names -from architectures.neon_components.embedders import * -from architectures.neon_components.heads import * -from architectures.neon_components.middleware import * -from architectures.neon_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes +from architectures.neon_components import architecture +from architectures.neon_components import embedders +from architectures.neon_components import middleware +from architectures.neon_components import heads +import configurations as conf -class GeneralNeonNetwork(NeonArchitecture): +class GeneralNeonNetwork(architecture.NeonArchitecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): self.global_network = global_network self.network_is_local = network_is_local @@ -34,7 +37,7 @@ class GeneralNeonNetwork(NeonArchitecture): self.activation_function = self.get_activation_function( tuning_parameters.agent.hidden_layers_activation_function) - NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) + architecture.NeonArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) def get_activation_function(self, activation_function_string): activation_functions = { @@ -53,36 +56,36 @@ class GeneralNeonNetwork(NeonArchitecture): # the observation can be either an image or a vector def get_observation_embedding(with_timestep=False): if self.input_height > 1: - return ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size, - name="observation") + return embedders.ImageEmbedder((self.input_depth, self.input_height, self.input_width), self.batch_size, + name="observation") else: - return VectorEmbedder((self.input_depth, 
self.input_width + int(with_timestep)), self.batch_size, - name="observation") + return embedders.VectorEmbedder((self.input_depth, self.input_width + int(with_timestep)), self.batch_size, + name="observation") input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"), - InputTypes.Action: VectorEmbedder((self.num_actions,), self.batch_size, name="action"), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), + conf.InputTypes.Observation: get_observation_embedding(), + conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="measurements"), + conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, self.batch_size, name="goal_vector"), + conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), self.batch_size, name="action"), + conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), } return input_mapping[embedder_type] def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach - MiddlewareTypes.FC: FC_Embedder}.get(middleware_type)(self.activation_function) + return {conf.MiddlewareTypes.LSTM: None, # LSTM over Neon is currently not supported in Coach + conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function) def get_output_head(self, head_type, head_idx, loss_weight=1.): output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach - OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach - OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach - OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach - OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach - OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach + conf.OutputTypes.Q: heads.QHead, + conf.OutputTypes.DuelingQ: heads.DuelingQHead, + conf.OutputTypes.V: None, # Policy Optimization algorithms over Neon are currently not supported in Coach + conf.OutputTypes.Pi: None, # Policy Optimization algorithms over Neon are currently not supported in Coach + conf.OutputTypes.MeasurementsPrediction: None, # DFP over Neon is currently not supported in Coach + conf.OutputTypes.DNDQ: None, # NEC over Neon is currently not supported in Coach + conf.OutputTypes.NAF: None, # NAF over Neon is currently not supported in Coach + conf.OutputTypes.PPO: None, # PPO over Neon is currently not supported in Coach + conf.OutputTypes.PPO_V: None # PPO over Neon is currently not supported in Coach } return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) @@ -104,7 +107,7 @@ class GeneralNeonNetwork(NeonArchitecture): done_creating_input_placeholders = False for network_idx in range(self.num_networks): - with name_scope('network_{}'.format(network_idx)): + with ngraph_names.name_scope('network_{}'.format(network_idx)): #################### # Input Embeddings # #################### diff --git 
a/architectures/neon_components/heads.py b/architectures/neon_components/heads.py index df49867..21af758 100644 --- a/architectures/neon_components/heads.py +++ b/architectures/neon_components/heads.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,13 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import ngraph as ng -from ngraph.util.names import name_scope -import ngraph.frontends.neon as neon -import numpy as np -from utils import force_list -from architectures.neon_components.losses import * +from ngraph.frontends import neon +from ngraph.util import names as ngraph_names + +import utils +from architectures.neon_components import losses class Head(object): @@ -30,7 +29,7 @@ class Head(object): self.loss = [] self.loss_type = [] self.regularizations = [] - self.loss_weight = force_list(loss_weight) + self.loss_weight = utils.force_list(loss_weight) self.weights_init = neon.GlorotInit() self.biases_init = neon.ConstantInit() self.target = [] @@ -44,15 +43,15 @@ class Head(object): :param input_layer: the input to the graph :return: the output of the last layer and the target placeholder """ - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): self._build_module(input_layer) - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) + self.output = utils.force_list(self.output) + self.target = utils.force_list(self.target) + self.input = utils.force_list(self.input) + self.loss_type = utils.force_list(self.loss_type) + self.loss = utils.force_list(self.loss) + self.regularizations = utils.force_list(self.regularizations) if self.is_local: self.set_loss() @@ -106,7 +105,7 @@ class QHead(Head): if tuning_parameters.agent.replace_mse_with_huber_loss: raise Exception("huber loss is not supported in neon") else: - self.loss_type = mean_squared_error + self.loss_type = losses.mean_squared_error def _build_module(self, input_layer): # Standard Q Network @@ -159,7 +158,7 @@ class MeasurementsPredictionHead(Head): if tuning_parameters.agent.replace_mse_with_huber_loss: raise Exception("huber loss is not supported in neon") else: - self.loss_type = mean_squared_error + self.loss_type = losses.mean_squared_error def _build_module(self, input_layer): # This is almost exactly the same as Dueling Network but we predict the future measurements for each action @@ -167,7 +166,7 @@ class MeasurementsPredictionHead(Head): multistep_measurements_size = self.measurements_size[0] * self.num_predicted_steps_ahead # actions expectation tower (expectation stream) - E - with name_scope("expectation_stream"): + with ngraph_names.name_scope("expectation_stream"): expectation_stream = neon.Sequential([ neon.Affine(nout=256, activation=neon.Rectlin(), weight_init=self.weights_init, bias_init=self.biases_init), @@ -176,7 +175,7 @@ class MeasurementsPredictionHead(Head): ])(input_layer) # action fine differences tower (action stream) - A - with name_scope("action_stream"): + with ngraph_names.name_scope("action_stream"): action_stream_unnormalized = neon.Sequential([ neon.Affine(nout=256, activation=neon.Rectlin(), 
weight_init=self.weights_init, bias_init=self.biases_init), @@ -191,4 +190,3 @@ class MeasurementsPredictionHead(Head): # merge to future measurements predictions self.output = repeated_expectation_stream + action_stream - diff --git a/architectures/neon_components/losses.py b/architectures/neon_components/losses.py index 26e8644..a6fc064 100644 --- a/architectures/neon_components/losses.py +++ b/architectures/neon_components/losses.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,15 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import ngraph as ng -import ngraph.frontends.neon as neon -from ngraph.util.names import name_scope -import numpy as np +from ngraph.util import names as ngraph_names def mean_squared_error(targets, outputs, weights=1.0, scope=""): - with name_scope(scope): + with ngraph_names.name_scope(scope): # TODO: reduce mean over the action axis loss = ng.squared_L2(targets - outputs) weighted_loss = loss * weights diff --git a/architectures/neon_components/middleware.py b/architectures/neon_components/middleware.py index 2aa02fd..fad7b9c 100644 --- a/architectures/neon_components/middleware.py +++ b/architectures/neon_components/middleware.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,11 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import ngraph as ng import ngraph.frontends.neon as neon -from ngraph.util.names import name_scope -import numpy as np +from ngraph.util import names as ngraph_names class MiddlewareEmbedder(object): @@ -30,7 +27,7 @@ class MiddlewareEmbedder(object): self.activation_function = activation_function def __call__(self, input_layer): - with name_scope(self.get_name()): + with ngraph_names.name_scope(self.get_name()): self.input = input_layer self._build_module() diff --git a/architectures/network_wrapper.py b/architectures/network_wrapper.py index ef026e6..d21bd77 100644 --- a/architectures/network_wrapper.py +++ b/architectures/network_wrapper.py @@ -13,20 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
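
Note: utils.force_list and utils.squeeze_list are called throughout these hunks (head outputs, targets, losses, predictions), but their bodies are not part of this diff. Judging by the call sites, they presumably normalize between single values and lists, roughly as in the assumed sketch below; the implementations shown here are guesses, not Coach's actual utils code.

def force_list(values):
    # Wrap a single value in a list; pass lists/tuples through as lists.
    if isinstance(values, (list, tuple)):
        return list(values)
    return [values]

def squeeze_list(values):
    # Unwrap single-element lists so callers can treat one output as a scalar.
    if isinstance(values, list) and len(values) == 1:
        return values[0]
    return values

if __name__ == "__main__":
    assert force_list(3) == [3]
    assert force_list([3, 4]) == [3, 4]
    assert squeeze_list([7]) == 7
    assert squeeze_list([7, 8]) == [7, 8]
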
# +import os +import collections -from collections import OrderedDict -from configurations import Preset, Frameworks -from logger import * +import configurations as conf +import logger try: import tensorflow as tf - from architectures.tensorflow_components.general_network import GeneralTensorFlowNetwork + from architectures.tensorflow_components import general_network as tf_net #import GeneralTensorFlowNetwork except ImportError: - failed_imports.append("TensorFlow") + logger.failed_imports.append("TensorFlow") try: - from architectures.neon_components.general_network import GeneralNeonNetwork + from architectures.neon_components import general_network as neon_net except ImportError: - failed_imports.append("Neon") + logger.failed_imports.append("Neon") class NetworkWrapper(object): @@ -50,12 +51,12 @@ class NetworkWrapper(object): self.name = name self.sess = tuning_parameters.sess - if self.tp.framework == Frameworks.TensorFlow: - general_network = GeneralTensorFlowNetwork - elif self.tp.framework == Frameworks.Neon: - general_network = GeneralNeonNetwork + if self.tp.framework == conf.Frameworks.TensorFlow: + general_network = tf_net.GeneralTensorFlowNetwork + elif self.tp.framework == conf.Frameworks.Neon: + general_network = neon_net.GeneralNeonNetwork else: - raise Exception("{} Framework is not supported".format(Frameworks().to_string(self.tp.framework))) + raise Exception("{} Framework is not supported".format(conf.Frameworks().to_string(self.tp.framework))) # Global network - the main network shared between threads self.global_network = None @@ -77,13 +78,13 @@ class NetworkWrapper(object): self.target_network = general_network(tuning_parameters, '{}/target'.format(name), network_is_local=True) - if not self.tp.distributed and self.tp.framework == Frameworks.TensorFlow: + if not self.tp.distributed and self.tp.framework == conf.Frameworks.TensorFlow: variables_to_restore = tf.global_variables() variables_to_restore = [v for v in variables_to_restore if '/online' in v.name] self.model_saver = tf.train.Saver(variables_to_restore) if self.tp.sess and self.tp.checkpoint_restore_dir: checkpoint = tf.train.latest_checkpoint(self.tp.checkpoint_restore_dir) - screen.log_title("Loading checkpoint: {}".format(checkpoint)) + logger.screen.log_title("Loading checkpoint: {}".format(checkpoint)) self.model_saver.restore(self.tp.sess, checkpoint) self.update_target_network() @@ -178,8 +179,8 @@ class NetworkWrapper(object): def save_model(self, model_id): saved_model_path = self.model_saver.save(self.tp.sess, os.path.join(self.tp.save_model_dir, str(model_id) + '.ckpt')) - screen.log_dict( - OrderedDict([ + logger.screen.log_dict( + collections.OrderedDict([ ("Saving model", saved_model_path), ]), prefix="Checkpoint" diff --git a/architectures/tensorflow_components/architecture.py b/architectures/tensorflow_components/architecture.py index 3474ff4..41396b1 100644 --- a/architectures/tensorflow_components/architecture.py +++ b/architectures/tensorflow_components/architecture.py @@ -15,12 +15,11 @@ # import time -import numpy as np import tensorflow as tf -from architectures.architecture import Architecture -from utils import force_list, squeeze_list -from configurations import Preset, MiddlewareTypes +from architectures import architecture +import configurations as conf +import utils def variable_summaries(var): """Attach a lot of summaries to a Tensor (for TensorBoard visualization).""" @@ -37,14 +36,14 @@ def variable_summaries(var): tf.summary.scalar('min', tf.reduce_min(var)) 
tf.summary.histogram('histogram', var) -class TensorFlowArchitecture(Architecture): +class TensorFlowArchitecture(architecture.Architecture): def __init__(self, tuning_parameters, name="", global_network=None, network_is_local=True): """ :param tuning_parameters: The parameters used for running the algorithm :type tuning_parameters: Preset :param name: The name of the network """ - Architecture.__init__(self, tuning_parameters, name) + architecture.Architecture.__init__(self, tuning_parameters, name) self.middleware_embedder = None self.network_is_local = network_is_local assert tuning_parameters.agent.tensorflow_support, 'TensorFlow is not supported for this agent' @@ -174,7 +173,7 @@ class TensorFlowArchitecture(Architecture): feed_dict = self._feed_dict(inputs) # feed targets - targets = force_list(targets) + targets = utils.force_list(targets) for placeholder_idx, target in enumerate(targets): feed_dict[self.targets[placeholder_idx]] = target @@ -186,13 +185,13 @@ class TensorFlowArchitecture(Architecture): else: fetches.append(self.tensor_gradients) fetches += [self.total_loss, self.losses] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: fetches.append(self.middleware_embedder.state_out) additional_fetches_start_idx = len(fetches) fetches += additional_fetches # feed the lstm state if necessary - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: # we can't always assume that we are starting from scratch here can we? feed_dict[self.middleware_embedder.c_in] = self.middleware_embedder.c_init feed_dict[self.middleware_embedder.h_in] = self.middleware_embedder.h_init @@ -206,7 +205,7 @@ class TensorFlowArchitecture(Architecture): # extract the fetches norm_unclipped_grads, grads, total_loss, losses = result[:4] - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4] fetched_tensors = [] if len(additional_fetches) > 0: @@ -308,7 +307,7 @@ class TensorFlowArchitecture(Architecture): if outputs is None: outputs = self.outputs - if self.tp.agent.middleware_type == MiddlewareTypes.LSTM: + if self.tp.agent.middleware_type == conf.MiddlewareTypes.LSTM: feed_dict[self.middleware_embedder.c_in] = self.curr_rnn_c_in feed_dict[self.middleware_embedder.h_in] = self.curr_rnn_h_in @@ -317,7 +316,7 @@ class TensorFlowArchitecture(Architecture): output = self.tp.sess.run(outputs, feed_dict) if squeeze_output: - output = squeeze_list(output) + output = utils.squeeze_list(output) return output diff --git a/architectures/tensorflow_components/embedders.py b/architectures/tensorflow_components/embedders.py index 880de2f..b0a4ff1 100644 --- a/architectures/tensorflow_components/embedders.py +++ b/architectures/tensorflow_components/embedders.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - import tensorflow as tf + from configurations import EmbedderComplexity diff --git a/architectures/tensorflow_components/general_network.py b/architectures/tensorflow_components/general_network.py index 03bb2a9..b9b4a78 100644 --- a/architectures/tensorflow_components/general_network.py +++ b/architectures/tensorflow_components/general_network.py @@ -13,15 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import tensorflow as tf -from architectures.tensorflow_components.embedders import * -from architectures.tensorflow_components.heads import * -from architectures.tensorflow_components.middleware import * -from architectures.tensorflow_components.architecture import * -from configurations import InputTypes, OutputTypes, MiddlewareTypes +from architectures.tensorflow_components import architecture +from architectures.tensorflow_components import embedders +from architectures.tensorflow_components import middleware +from architectures.tensorflow_components import heads +import configurations as conf -class GeneralTensorFlowNetwork(TensorFlowArchitecture): +class GeneralTensorFlowNetwork(architecture.TensorFlowArchitecture): """ A generalized version of all possible networks implemented using tensorflow. """ @@ -37,7 +38,7 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): self.activation_function = self.get_activation_function( tuning_parameters.agent.hidden_layers_activation_function) - TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) + architecture.TensorFlowArchitecture.__init__(self, tuning_parameters, name, global_network, network_is_local) def get_activation_function(self, activation_function_string): activation_functions = { @@ -56,37 +57,37 @@ class GeneralTensorFlowNetwork(TensorFlowArchitecture): # the observation can be either an image or a vector def get_observation_embedding(with_timestep=False): if self.input_height > 1: - return ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", - input_rescaler=self.tp.agent.input_rescaler) + return embedders.ImageEmbedder((self.input_height, self.input_width, self.input_depth), name="observation", + input_rescaler=self.tp.agent.input_rescaler) else: - return VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation") + return embedders.VectorEmbedder((self.input_width + int(with_timestep), self.input_depth), name="observation") input_mapping = { - InputTypes.Observation: get_observation_embedding(), - InputTypes.Measurements: VectorEmbedder(self.measurements_size, name="measurements"), - InputTypes.GoalVector: VectorEmbedder(self.measurements_size, name="goal_vector"), - InputTypes.Action: VectorEmbedder((self.num_actions,), name="action"), - InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), + conf.InputTypes.Observation: get_observation_embedding(), + conf.InputTypes.Measurements: embedders.VectorEmbedder(self.measurements_size, name="measurements"), + conf.InputTypes.GoalVector: embedders.VectorEmbedder(self.measurements_size, name="goal_vector"), + conf.InputTypes.Action: embedders.VectorEmbedder((self.num_actions,), name="action"), + conf.InputTypes.TimedObservation: get_observation_embedding(with_timestep=True), } return input_mapping[embedder_type] def get_middleware_embedder(self, middleware_type): - return {MiddlewareTypes.LSTM: LSTM_Embedder, - MiddlewareTypes.FC: 
FC_Embedder}.get(middleware_type)(self.activation_function) + return {conf.MiddlewareTypes.LSTM: middleware.LSTM_Embedder, + conf.MiddlewareTypes.FC: middleware.FC_Embedder}.get(middleware_type)(self.activation_function) def get_output_head(self, head_type, head_idx, loss_weight=1.): output_mapping = { - OutputTypes.Q: QHead, - OutputTypes.DuelingQ: DuelingQHead, - OutputTypes.V: VHead, - OutputTypes.Pi: PolicyHead, - OutputTypes.MeasurementsPrediction: MeasurementsPredictionHead, - OutputTypes.DNDQ: DNDQHead, - OutputTypes.NAF: NAFHead, - OutputTypes.PPO: PPOHead, - OutputTypes.PPO_V: PPOVHead, - OutputTypes.CategoricalQ: CategoricalQHead, - OutputTypes.QuantileRegressionQ: QuantileRegressionQHead + conf.OutputTypes.Q: heads.QHead, + conf.OutputTypes.DuelingQ: heads.DuelingQHead, + conf.OutputTypes.V: heads.VHead, + conf.OutputTypes.Pi: heads.PolicyHead, + conf.OutputTypes.MeasurementsPrediction: heads.MeasurementsPredictionHead, + conf.OutputTypes.DNDQ: heads.DNDQHead, + conf.OutputTypes.NAF: heads.NAFHead, + conf.OutputTypes.PPO: heads.PPOHead, + conf.OutputTypes.PPO_V: heads.PPOVHead, + conf.OutputTypes.CategoricalQ: heads.CategoricalQHead, + conf.OutputTypes.QuantileRegressionQ: heads.QuantileRegressionQHead } return output_mapping[head_type](self.tp, head_idx, loss_weight, self.network_is_local) diff --git a/architectures/tensorflow_components/heads.py b/architectures/tensorflow_components/heads.py index 616ab13..fdf1919 100644 --- a/architectures/tensorflow_components/heads.py +++ b/architectures/tensorflow_components/heads.py @@ -13,10 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import tensorflow as tf import numpy as np -from utils import force_list + +import utils # Used to initialize weights for policy and value output layers @@ -36,7 +36,7 @@ class Head(object): self.loss = [] self.loss_type = [] self.regularizations = [] - self.loss_weight = force_list(loss_weight) + self.loss_weight = utils.force_list(loss_weight) self.target = [] self.input = [] self.is_local = is_local @@ -50,12 +50,12 @@ class Head(object): with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()): self._build_module(input_layer) - self.output = force_list(self.output) - self.target = force_list(self.target) - self.input = force_list(self.input) - self.loss_type = force_list(self.loss_type) - self.loss = force_list(self.loss) - self.regularizations = force_list(self.regularizations) + self.output = utils.force_list(self.output) + self.target = utils.force_list(self.target) + self.input = utils.force_list(self.input) + self.loss_type = utils.force_list(self.loss_type) + self.loss = utils.force_list(self.loss) + self.regularizations = utils.force_list(self.regularizations) if self.is_local: self.set_loss() self._post_build() diff --git a/architectures/tensorflow_components/middleware.py b/architectures/tensorflow_components/middleware.py index dfe1597..3ef2631 100644 --- a/architectures/tensorflow_components/middleware.py +++ b/architectures/tensorflow_components/middleware.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - import tensorflow as tf import numpy as np diff --git a/architectures/tensorflow_components/shared_variables.py b/architectures/tensorflow_components/shared_variables.py index 2775251..23d179e 100644 --- a/architectures/tensorflow_components/shared_variables.py +++ b/architectures/tensorflow_components/shared_variables.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - import tensorflow as tf import numpy as np @@ -79,4 +78,4 @@ class SharedRunningStats(object): @property def shape(self): - return self._shape \ No newline at end of file + return self._shape diff --git a/coach.py b/coach.py index 8ba8cf3..73f40dc 100644 --- a/coach.py +++ b/coach.py @@ -13,46 +13,42 @@ # See the License for the specific language governing permissions and # limitations under the License. # - -import sys, inspect, re -import os -import json -import presets -from presets import * -from utils import set_gpu, list_all_classes_in_module -from architectures import * -from environments import * -from agents import * -from utils import * -from logger import screen, logger -import argparse -from subprocess import Popen -import datetime -import presets import atexit -import sys +import json +import os +import re import subprocess -from threading import Thread +import sys +import time -if len(set(failed_imports)) > 0: - screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(failed_imports)))) +import agents +import argparse +import configurations as conf +import environments +import logger +import presets +import utils + + +if len(set(logger.failed_imports)) > 0: + logger.screen.warning("Warning: failed to import the following packages - {}".format(', '.join(set(logger.failed_imports)))) def set_framework(framework_type): # choosing neural network framework - framework = Frameworks().get(framework_type) + framework = conf.Frameworks().get(framework_type) sess = None - if framework == Frameworks.TensorFlow: + if framework == conf.Frameworks.TensorFlow: import tensorflow as tf config = tf.ConfigProto() config.allow_soft_placement = True config.gpu_options.allow_growth = True config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.Session(config=config) - elif framework == Frameworks.Neon: + elif framework == conf.Frameworks.Neon: import ngraph as ng sess = ng.transformers.make_transformer() - screen.log_title("Using {} framework".format(Frameworks().to_string(framework))) + logger.screen.log_title("Using {} framework".format(conf.Frameworks().to_string(framework))) return sess @@ -66,8 +62,8 @@ def check_input_and_fill_run_dict(parser): # list available presets if args.list: - presets_lists = list_all_classes_in_module(presets) - screen.log_title("Available Presets:") + presets_lists = utils.list_all_classes_in_module(presets) + logger.screen.log_title("Available Presets:") for preset in presets_lists: print(preset) sys.exit(0) @@ -77,28 +73,28 @@ def check_input_and_fill_run_dict(parser): # num_workers = int(args.num_workers) num_workers = int(re.match("^\d+$", args.num_workers).group(0)) except ValueError: - screen.error("Parameter num_workers should be an integer.") + logger.screen.error("Parameter num_workers should be an integer.") - preset_names = 
list_all_classes_in_module(presets) + preset_names = utils.list_all_classes_in_module(presets) if args.preset is not None and args.preset not in preset_names: - screen.error("A non-existing preset was selected. ") + logger.screen.error("A non-existing preset was selected. ") if args.checkpoint_restore_dir is not None and not os.path.exists(args.checkpoint_restore_dir): - screen.error("The requested checkpoint folder to load from does not exist. ") + logger.screen.error("The requested checkpoint folder to load from does not exist. ") if args.save_model_sec is not None: try: args.save_model_sec = int(args.save_model_sec) except ValueError: - screen.error("Parameter save_model_sec should be an integer.") + logger.screen.error("Parameter save_model_sec should be an integer.") if args.preset is None and (args.agent_type is None or args.environment_type is None or args.exploration_policy_type is None) and not args.play: - screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,' + logger.screen.error('When no preset is given for Coach to run, the user is expected to input the desired agent_type,' ' environment_type and exploration_policy_type to assemble a preset. ' '\nAt least one of these parameters was not given.') elif args.preset is None and args.play and args.environment_type is None: - screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,' + logger.screen.error('When no preset is given for Coach to run, and the user requests human control over the environment,' ' the user is expected to input the desired environment_type and level.' '\nAt least one of these parameters was not given.') elif args.preset is None and args.play and args.environment_type: @@ -106,11 +102,11 @@ def check_input_and_fill_run_dict(parser): args.exploration_policy_type = 'ExplorationParameters' # get experiment name and path - experiment_name = logger.get_experiment_name(args.experiment_name) - experiment_path = logger.get_experiment_path(experiment_name) + experiment_name = logger.logger.get_experiment_name(args.experiment_name) + experiment_path = logger.logger.get_experiment_path(experiment_name) if args.play and num_workers > 1: - screen.warning("Playing the game as a human is only available with a single worker. " + logger.screen.warning("Playing the game as a human is only available with a single worker. 
" "The number of workers will be reduced to 1") num_workers = 1 @@ -123,7 +119,7 @@ def check_input_and_fill_run_dict(parser): run_dict['preset'] = args.preset run_dict['custom_parameter'] = args.custom_parameter run_dict['experiment_path'] = experiment_path - run_dict['framework'] = Frameworks().get(args.framework) + run_dict['framework'] = conf.Frameworks().get(args.framework) run_dict['play'] = args.play run_dict['evaluate'] = args.evaluate# or args.play @@ -251,16 +247,16 @@ if __name__ == "__main__": os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # dump documentation - logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) + logger.logger.set_dump_dir(run_dict['experiment_path'], add_timestamp=True) if not args.no_summary: - atexit.register(logger.summarize_experiment) - screen.change_terminal_title(logger.experiment_name) + atexit.register(logger.logger.summarize_experiment) + logger.screen.change_terminal_title(logger.logger.experiment_name) # Single-threaded runs if run_dict['num_threads'] == 1: # set tuning parameters json_run_dict_path = run_dict_to_json(run_dict) - tuning_parameters = json_to_preset(json_run_dict_path) + tuning_parameters = presets.json_to_preset(json_run_dict_path) tuning_parameters.sess = set_framework(args.framework) if args.print_parameters: @@ -268,8 +264,9 @@ if __name__ == "__main__": # Single-thread runs tuning_parameters.task_index = 0 - env_instance = create_environment(tuning_parameters) - agent = eval(tuning_parameters.agent.type + '(env_instance, tuning_parameters)') + env_instance = environments.create_environment(tuning_parameters) + agent = eval('agents.' + tuning_parameters.agent.type + + '(env_instance, tuning_parameters)') # Start the training or evaluation if tuning_parameters.evaluate: @@ -282,11 +279,11 @@ if __name__ == "__main__": assert args.framework.lower() == 'tensorflow', "Distributed training works only with TensorFlow" os.environ["OMP_NUM_THREADS"]="1" # set parameter server and workers addresses - ps_hosts = "localhost:{}".format(get_open_port()) - worker_hosts = ",".join(["localhost:{}".format(get_open_port()) for i in range(run_dict['num_threads'] + 1)]) + ps_hosts = "localhost:{}".format(utils.get_open_port()) + worker_hosts = ",".join(["localhost:{}".format(utils.get_open_port()) for i in range(run_dict['num_threads'] + 1)]) # Make sure to disable GPU so that all the workers will use the CPU - set_cpu() + utils.set_cpu() # create a parameter server cmd = [ @@ -296,9 +293,9 @@ if __name__ == "__main__": "--worker_hosts={}".format(worker_hosts), "--job_name=ps", ] - parameter_server = Popen(cmd) + parameter_server = subprocess.Popen(cmd) - screen.log_title("*** Distributed Training ***") + logger.screen.log_title("*** Distributed Training ***") time.sleep(1) # create N training workers and 1 evaluating worker @@ -321,7 +318,7 @@ if __name__ == "__main__": "--job_name=worker", "--load_json={}".format(json_run_dict_path)] - p = Popen(workers_args) + p = subprocess.Popen(workers_args) if i != run_dict['num_threads']: workers.append(p) diff --git a/configurations.py b/configurations.py index 5e553d8..f2054a3 100644 --- a/configurations.py +++ b/configurations.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - -from utils import Enum import json + import types +import utils -class Frameworks(Enum): +class Frameworks(utils.Enum): TensorFlow = 1 Neon = 2 diff --git a/dashboard.py b/dashboard.py index f21b13b..b2aaa0f 100644 --- a/dashboard.py +++ b/dashboard.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2017 Intel Corporation +# Copyright (c) 2017 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,29 +19,24 @@ To run Coach Dashboard, run the following command: python3 dashboard.py """ -from utils import * -import os -import datetime - -import sys -import wx -import random -import pandas as pd -from pandas.io.common import EmptyDataError -import numpy as np import colorsys -from bokeh.palettes import Dark2 -from bokeh.layouts import row, column, widgetbox, Spacer -from bokeh.models import ColumnDataSource, Range1d, LinearAxis, HoverTool, WheelZoomTool, PanTool, Legend -from bokeh.models.widgets import RadioButtonGroup, MultiSelect, Button, Select, Slider, Div, CheckboxGroup -from bokeh.models.glyphs import Patch -from bokeh.plotting import figure, show, curdoc -from utils import force_list -from utils import squeeze_list -from itertools import cycle -from os import listdir -from os.path import isfile, join, isdir, basename -from enum import Enum +import datetime +import enum +import itertools +import os +import random + +from bokeh import palettes +from bokeh import layouts as bl +from bokeh import models as bm +from bokeh.models import widgets as bw +from bokeh import plotting as bp +import numpy as np +import pandas as pd +from pandas.io import pandas_common +import wx + +import utils class DialogApp(wx.App): @@ -67,7 +62,7 @@ class Signal: self.name = name self.full_name = "{}/{}".format(parent.filename, self.name) self.selected = False - self.color = random.choice(Dark2[8]) + self.color = random.choice(palettes.Dark2[8]) self.line = None self.bands = None self.bokeh_source = parent.bokeh_source @@ -79,12 +74,12 @@ class Signal: if (len(name.split('/')) == 1 and name == self.name) or '/'.join(name.split('/')[:-1]) == self.name: self.sub_signals.append(name) if len(self.sub_signals) > 1: - self.mean_signal = squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]]) - self.stdev_signal = squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]]) - self.min_signal = squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]]) - self.max_signal = squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]]) + self.mean_signal = utils.squeeze_list([name for name in self.sub_signals if 'Mean' in name.split('/')[-1]]) + self.stdev_signal = utils.squeeze_list([name for name in self.sub_signals if 'Stdev' in name.split('/')[-1]]) + self.min_signal = utils.squeeze_list([name for name in self.sub_signals if 'Min' in name.split('/')[-1]]) + self.max_signal = utils.squeeze_list([name for name in self.sub_signals if 'Max' in name.split('/')[-1]]) else: - self.mean_signal = squeeze_list(self.name) + self.mean_signal = utils.squeeze_list(self.name) self.stdev_signal = None self.min_signal = None self.max_signal = None @@ -107,16 +102,16 @@ class Signal: if self.selected != val: self.selected = val if self.line: - # self.set_color(Dark2[8][current_color]) - # current_color = (current_color + 1) % len(Dark2[8]) + # self.set_color(palettes.Dark2[8][current_color]) + # current_color = (current_color + 1) % 
len(palettes.Dark2[8]) self.line.visible = self.selected if self.bands: self.bands.visible = self.selected and self.show_bollinger_bands elif self.selected: # lazy plotting - plot only when selected for the first time show_spinner() - self.set_color(Dark2[8][current_color]) - current_color = (current_color + 1) % len(Dark2[8]) + self.set_color(palettes.Dark2[8][current_color]) + current_color = (current_color + 1) % len(palettes.Dark2[8]) if self.has_bollinger_bands: self.set_bands_source() self.create_bands() @@ -149,7 +144,7 @@ class Signal: if self.bollinger_bands_source: self.bollinger_bands_source.data = source_data else: - self.bollinger_bands_source = ColumnDataSource(source_data) + self.bollinger_bands_source = bm.ColumnDataSource(source_data) def change_bollinger_bands_state(self, new_state): self.show_bollinger_bands = new_state @@ -192,11 +187,11 @@ class SignalsFileBase: def update_source_and_signals(self): # create bokeh data sources - self.bokeh_source_orig = ColumnDataSource(self.csv) + self.bokeh_source_orig = bm.ColumnDataSource(self.csv) self.bokeh_source_orig.data['index'] = self.bokeh_source_orig.data[x_axis] if self.bokeh_source is None: - self.bokeh_source = ColumnDataSource(self.csv) + self.bokeh_source = bm.ColumnDataSource(self.csv) else: # self.bokeh_source.data = self.bokeh_source_orig.data # smooth the data if necessary @@ -282,7 +277,7 @@ class SignalsFile(SignalsFileBase): def __init__(self, csv_path, load=True): SignalsFileBase.__init__(self) self.full_csv_path = csv_path - self.dir, self.filename, _ = break_file_path(csv_path) + self.dir, self.filename, _ = utils.break_file_path(csv_path) if load: self.load() # this helps set the correct x axis @@ -296,7 +291,7 @@ class SignalsFile(SignalsFileBase): try: self.csv = pd.read_csv(self.full_csv_path) break - except EmptyDataError: + except pandas_common.EmptyDataError: self.csv = None continue self.csv = self.csv.interpolate() @@ -327,7 +322,7 @@ class SignalsFilesGroup(SignalsFileBase): else: # get the common directory for all the experiments self.dir = os.path.dirname(os.path.commonprefix(csv_paths)) - self.filename = '{} - Group({})'.format(basename(self.dir), len(self.signals_files)) + self.filename = '{} - Group({})'.format(os.path.basename(self.dir), len(self.signals_files)) self.load() # this helps set the correct x axis @@ -425,7 +420,7 @@ class SignalsFilesGroup(SignalsFileBase): pass -class RunType(Enum): +class RunType(enum.Enum): SINGLE_FOLDER_SINGLE_FILE = 1 SINGLE_FOLDER_MULTIPLE_FILES = 2 MULTIPLE_FOLDERS_SINGLE_FILES = 3 @@ -433,7 +428,7 @@ class RunType(Enum): UNKNOWN = 0 -class FolderType(Enum): +class FolderType(enum.Enum): SINGLE_FILE = 1 MULTIPLE_FILES = 2 MULTIPLE_FOLDERS = 3 @@ -454,24 +449,24 @@ root_dir = os.path.dirname(os.path.abspath(__file__)) with open(os.path.join(root_dir, 'spinner.css'), 'r') as f: spinner_style = """""".format(f.read()) spinner_html = """