From 52eb159f696e9cd810072c3a2cc350ae8843f707 Mon Sep 17 00:00:00 2001
From: Itai Caspi <30383381+itaicaspi-intel@users.noreply.github.com>
Date: Mon, 23 Apr 2018 10:44:46 +0300
Subject: [PATCH] multiple bug fixes in dealing with measurements +
 CartPole_DFP preset (#92)

---
 agents/agent.py                          | 14 ++++++++++----
 agents/dfp_agent.py                      |  9 ++++++---
 environments/doom_environment_wrapper.py |  2 +-
 environments/gym_environment_wrapper.py  |  2 +-
 presets.py                               | 13 +++++++++++++
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/agents/agent.py b/agents/agent.py
index 006544e..2dd6818 100644
--- a/agents/agent.py
+++ b/agents/agent.py
@@ -365,7 +365,10 @@ class Agent(object):
             'observation': observation
         }
         if self.tp.agent.use_measurements:
-            self.curr_state['measurements'] = self.env.measurements
+            if 'measurements' in self.curr_state.keys():
+                self.curr_state['measurements'] = self.env.state['measurements']
+            else:
+                self.curr_state['measurements'] = np.zeros(0)
             if self.tp.agent.use_accumulated_reward_as_measurement:
                 self.curr_state['measurements'] = np.append(self.curr_state['measurements'], 0)
 
@@ -398,7 +401,7 @@ class Agent(object):
             shaped_reward += action_info['action_intrinsic_reward']
         # TODO: should total_reward_in_current_episode include shaped_reward?
         self.total_reward_in_current_episode += result['reward']
-        next_state = result['state']
+        next_state = copy.copy(result['state'])
         next_state['observation'] = self.preprocess_observation(next_state['observation'])
 
         # plot action values online
@@ -411,8 +414,11 @@ class Agent(object):
             observation = LazyStack(self.curr_stack, -1)
 
         next_state['observation'] = observation
-        if self.tp.agent.use_measurements and 'measurements' in result.keys():
-            next_state['measurements'] = result['state']['measurements']
+        if self.tp.agent.use_measurements:
+            if 'measurements' in result['state'].keys():
+                next_state['measurements'] = result['state']['measurements']
+            else:
+                next_state['measurements'] = np.zeros(0)
             if self.tp.agent.use_accumulated_reward_as_measurement:
                 next_state['measurements'] = np.append(next_state['measurements'],
                                                        self.total_reward_in_current_episode)
diff --git a/agents/dfp_agent.py b/agents/dfp_agent.py
index c055d2c..8f98b94 100644
--- a/agents/dfp_agent.py
+++ b/agents/dfp_agent.py
@@ -31,7 +31,7 @@ class DFPAgent(Agent):
 
         # create the inputs for the network
         input = current_states
-        input.append(np.repeat(np.expand_dims(self.current_goal, 0), self.tp.batch_size, 0))
+        input['goal'] = np.repeat(np.expand_dims(self.current_goal, 0), self.tp.batch_size, 0)
 
         # get the current outputs of the network
         targets = self.main_network.online_network.predict(input)
@@ -40,7 +40,7 @@
         for i in range(self.tp.batch_size):
             targets[i, actions[i]] = batch[i].info['future_measurements'].flatten()
 
-        result = self.main_network.train_and_sync_networks(current_states, targets)
+        result = self.main_network.train_and_sync_networks(input, targets)
         total_loss = result[0]
 
         return total_loss
@@ -52,7 +52,10 @@
         goal = np.expand_dims(self.current_goal, 0)
 
         # predict the future measurements
-        measurements_future_prediction = self.main_network.online_network.predict([observation, measurements, goal])[0]
+        measurements_future_prediction = self.main_network.online_network.predict({
+            "observation": observation,
+            "measurements": measurements,
+            "goal": goal})[0]
         action_values = np.zeros((self.action_space_size,))
         num_steps_used_for_objective = len(self.tp.agent.future_measurements_weights)
 
diff --git a/environments/doom_environment_wrapper.py b/environments/doom_environment_wrapper.py
index a0c618d..3483244 100644
--- a/environments/doom_environment_wrapper.py
+++ b/environments/doom_environment_wrapper.py
@@ -135,7 +135,7 @@ class DoomEnvironmentWrapper(EnvironmentWrapper):
         # extract all data from the current state
         state = self.game.get_state()
        if state is not None and state.screen_buffer is not None:
-            self.observation = {
+            self.state = {
                 'observation': state.screen_buffer,
                 'measurements': state.game_variables,
             }
diff --git a/environments/gym_environment_wrapper.py b/environments/gym_environment_wrapper.py
index c821bf8..cbd260c 100644
--- a/environments/gym_environment_wrapper.py
+++ b/environments/gym_environment_wrapper.py
@@ -113,7 +113,7 @@ class GymEnvironmentWrapper(EnvironmentWrapper):
             self.timestep_limit = self.env.spec.timestep_limit
         else:
             self.timestep_limit = None
-        self.measurements_size = len(self.step(0)['info'].keys())
+        self.measurements_size = (len(self.step(0)['info'].keys()),)
         self.random_initialization_steps = self.tp.env.random_initialization_steps
 
     def _wrap_state(self, state):
diff --git a/presets.py b/presets.py
index dcfb765..21841e9 100644
--- a/presets.py
+++ b/presets.py
@@ -200,6 +200,19 @@ class CartPole_PAL(Preset):
         self.test_max_step_threshold = 100
         self.test_min_return_threshold = 150
 
+
+class CartPole_DFP(Preset):
+    def __init__(self):
+        Preset.__init__(self, DFP, GymVectorObservation, ExplorationParameters)
+        self.env.level = 'CartPole-v0'
+        self.agent.num_episodes_in_experience_replay = 200
+        self.learning_rate = 0.0001
+        self.num_heatup_steps = 1000
+        self.exploration.epsilon_decay_steps = 10000
+        self.agent.use_accumulated_reward_as_measurement = True
+        self.agent.goal_vector = [1.0]
+
+
 class Doom_Basic_DFP(Preset):
     def __init__(self):
         Preset.__init__(self, DFP, Doom, ExplorationParameters)
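With the patch applied, the new CartPole_DFP preset should be runnable through Coach's regular launcher. A minimal sketch, assuming the repository's coach.py entry point and its -p preset-selection flag behave the same as for the existing presets:

    # hypothetical invocation; the preset name comes from the CartPole_DFP class added above
    python3 coach.py -p CartPole_DFP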