From f9ee526536cc0ffbc42078c53565ffe35c51e1e3 Mon Sep 17 00:00:00 2001 From: Gal Leibovich Date: Sun, 16 Dec 2018 16:06:44 +0200 Subject: [PATCH 1/4] Fix for issue #128 - circular DQN import (#130) --- rl_coach/agents/dqn_agent.py | 1 + rl_coach/base_parameters.py | 3 +++ rl_coach/exploration_policies/parameter_noise.py | 4 ++-- .../filters/observation/observation_normalization_filter.py | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/rl_coach/agents/dqn_agent.py b/rl_coach/agents/dqn_agent.py index a60aac2..b234e88 100644 --- a/rl_coach/agents/dqn_agent.py +++ b/rl_coach/agents/dqn_agent.py @@ -36,6 +36,7 @@ class DQNAlgorithmParameters(AlgorithmParameters): self.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(10000) self.num_consecutive_playing_steps = EnvironmentSteps(4) self.discount = 0.99 + self.supports_parameter_noise = True class DQNNetworkParameters(NetworkParameters): diff --git a/rl_coach/base_parameters.py b/rl_coach/base_parameters.py index d3b5999..da368c3 100644 --- a/rl_coach/base_parameters.py +++ b/rl_coach/base_parameters.py @@ -211,6 +211,9 @@ class AlgorithmParameters(Parameters): # Should the workers wait for full episode self.act_for_full_episodes = False + # Support for parameter noise + self.supports_parameter_noise = False + class PresetValidationParameters(Parameters): def __init__(self, diff --git a/rl_coach/exploration_policies/parameter_noise.py b/rl_coach/exploration_policies/parameter_noise.py index 34381c4..7854329 100644 --- a/rl_coach/exploration_policies/parameter_noise.py +++ b/rl_coach/exploration_policies/parameter_noise.py @@ -18,7 +18,6 @@ from typing import List, Dict import numpy as np -from rl_coach.agents.dqn_agent import DQNAgentParameters from rl_coach.architectures.layers import NoisyNetDense from rl_coach.base_parameters import AgentParameters, NetworkParameters from rl_coach.spaces import ActionSpace, BoxActionSpace, DiscreteActionSpace @@ -30,7 +29,8 @@ from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, class ParameterNoiseParameters(ExplorationParameters): def __init__(self, agent_params: AgentParameters): super().__init__() - if not isinstance(agent_params, DQNAgentParameters): + + if not agent_params.algorithm.supports_parameter_noise: raise ValueError("Currently only DQN variants are supported for using an exploration type of " "ParameterNoise.") diff --git a/rl_coach/filters/observation/observation_normalization_filter.py b/rl_coach/filters/observation/observation_normalization_filter.py index db9e104..791b345 100644 --- a/rl_coach/filters/observation/observation_normalization_filter.py +++ b/rl_coach/filters/observation/observation_normalization_filter.py @@ -87,3 +87,4 @@ class ObservationNormalizationFilter(ObservationFilter): def restore_state_from_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str): self.running_observation_stats.restore_state_from_checkpoint(checkpoint_dir, checkpoint_prefix) + \ No newline at end of file From b4bc8a476ccbb1b595dd93efa6475ce11a4a8a32 Mon Sep 17 00:00:00 2001 From: Neta Zmora <31280975+nzmora@users.noreply.github.com> Date: Mon, 17 Dec 2018 10:08:54 +0200 Subject: [PATCH 2/4] Bug fix: when enabling 'heatup_using_network_decisions', we should add the configured noise (#162) During heatup we may want to add agent-generated-noise (i.e. not "simple" random noise). This is enabled by setting 'heatup_using_network_decisions' to True. For example: agent_params = DDPGAgentParameters() agent_params.algorithm.heatup_using_network_decisions = True The fix ensures that the correct noise is added not just while in the TRAINING phase, but also during the HEATUP phase. No one has enabled 'heatup_using_network_decisions' yet, which explains why this problem arose only now (in my configuration I do enable 'heatup_using_network_decisions'). --- rl_coach/exploration_policies/additive_noise.py | 2 +- rl_coach/exploration_policies/truncated_normal.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rl_coach/exploration_policies/additive_noise.py b/rl_coach/exploration_policies/additive_noise.py index 682021c..5f89889 100644 --- a/rl_coach/exploration_policies/additive_noise.py +++ b/rl_coach/exploration_policies/additive_noise.py @@ -88,7 +88,7 @@ class AdditiveNoise(ExplorationPolicy): action_values_mean = action_values.squeeze() # step the noise schedule - if self.phase == RunPhase.TRAIN: + if self.phase is not RunPhase.TEST: self.noise_percentage_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: diff --git a/rl_coach/exploration_policies/truncated_normal.py b/rl_coach/exploration_policies/truncated_normal.py index bfd0ba1..396f348 100644 --- a/rl_coach/exploration_policies/truncated_normal.py +++ b/rl_coach/exploration_policies/truncated_normal.py @@ -92,7 +92,7 @@ class TruncatedNormal(ExplorationPolicy): action_values_mean = action_values.squeeze() # step the noise schedule - if self.phase == RunPhase.TRAIN: + if self.phase is not RunPhase.TEST: self.noise_percentage_schedule.step() # the second element of the list is assumed to be the standard deviation if isinstance(action_values, list) and len(action_values) > 1: From 4c914c057c02739bee25f8d4599a8260f86da296 Mon Sep 17 00:00:00 2001 From: Gal Leibovich Date: Mon, 17 Dec 2018 21:36:27 +0200 Subject: [PATCH 3/4] fix for finding the right filter checkpoint to restore + do not update internal filter state when evaluating + fix SharedRunningStats checkpoint filenames (#147) --- rl_coach/agents/agent.py | 10 +++++++--- rl_coach/agents/clipped_ppo_agent.py | 2 +- rl_coach/filters/filter.py | 16 ++++++++-------- rl_coach/utilities/shared_running_stats.py | 6 +++--- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/rl_coach/agents/agent.py b/rl_coach/agents/agent.py index 53dc4c3..a6b5297 100644 --- a/rl_coach/agents/agent.py +++ b/rl_coach/agents/agent.py @@ -762,7 +762,8 @@ class Agent(AgentInterface): # informed action if self.pre_network_filter is not None: # before choosing an action, first use the pre_network_filter to filter out the current state - curr_state = self.run_pre_network_filter_for_inference(self.curr_state) + update_filter_internal_state = self.phase is not RunPhase.TEST + curr_state = self.run_pre_network_filter_for_inference(self.curr_state, update_filter_internal_state) else: curr_state = self.curr_state @@ -772,15 +773,18 @@ class Agent(AgentInterface): return filtered_action_info - def run_pre_network_filter_for_inference(self, state: StateType) -> StateType: + def run_pre_network_filter_for_inference(self, state: StateType, update_filter_internal_state: bool=True)\ + -> StateType: """ Run filters which where defined for being applied right before using the state for inference. :param state: The state to run the filters on + :param update_filter_internal_state: Should update the filter's internal state - should not update when evaluating :return: The filtered state """ dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False) - return self.pre_network_filter.filter(dummy_env_response)[0].next_state + return self.pre_network_filter.filter(dummy_env_response, + update_internal_state=update_filter_internal_state)[0].next_state def get_state_embedding(self, state: dict) -> np.ndarray: """ diff --git a/rl_coach/agents/clipped_ppo_agent.py b/rl_coach/agents/clipped_ppo_agent.py index 9fa8d72..71ccdce 100644 --- a/rl_coach/agents/clipped_ppo_agent.py +++ b/rl_coach/agents/clipped_ppo_agent.py @@ -325,7 +325,7 @@ class ClippedPPOAgent(ActorCriticAgent): self.update_log() return None - def run_pre_network_filter_for_inference(self, state: StateType): + def run_pre_network_filter_for_inference(self, state: StateType, update_internal_state: bool=False): dummy_env_response = EnvResponse(next_state=state, reward=0, game_over=False) return self.pre_network_filter.filter(dummy_env_response, update_internal_state=False)[0].next_state diff --git a/rl_coach/filters/filter.py b/rl_coach/filters/filter.py index 6ad2d55..6881f8d 100644 --- a/rl_coach/filters/filter.py +++ b/rl_coach/filters/filter.py @@ -466,14 +466,14 @@ class InputFilter(Filter): if self.name is not None: checkpoint_prefix = '.'.join([checkpoint_prefix, self.name]) for filter_name, filter in self._reward_filters.items(): - checkpoint_prefix = '.'.join([checkpoint_prefix, 'reward_filters', filter_name]) - filter.save_state_to_checkpoint(checkpoint_dir, checkpoint_prefix) + curr_reward_filter_ckpt_prefix = '.'.join([checkpoint_prefix, 'reward_filters', filter_name]) + filter.save_state_to_checkpoint(checkpoint_dir, curr_reward_filter_ckpt_prefix) for observation_name, filters_dict in self._observation_filters.items(): for filter_name, filter in filters_dict.items(): - checkpoint_prefix = '.'.join([checkpoint_prefix, 'observation_filters', observation_name, + curr_obs_filter_ckpt_prefix = '.'.join([checkpoint_prefix, 'observation_filters', observation_name, filter_name]) - filter.save_state_to_checkpoint(checkpoint_dir, checkpoint_prefix) + filter.save_state_to_checkpoint(checkpoint_dir, curr_obs_filter_ckpt_prefix) def restore_state_from_checkpoint(self, checkpoint_dir, checkpoint_prefix)->None: """ @@ -486,14 +486,14 @@ class InputFilter(Filter): if self.name is not None: checkpoint_prefix = '.'.join([checkpoint_prefix, self.name]) for filter_name, filter in self._reward_filters.items(): - checkpoint_prefix = '.'.join([checkpoint_prefix, 'reward_filters', filter_name]) - filter.restore_state_from_checkpoint(checkpoint_dir, checkpoint_prefix) + curr_reward_filter_ckpt_prefix = '.'.join([checkpoint_prefix, 'reward_filters', filter_name]) + filter.restore_state_from_checkpoint(checkpoint_dir, curr_reward_filter_ckpt_prefix) for observation_name, filters_dict in self._observation_filters.items(): for filter_name, filter in filters_dict.items(): - checkpoint_prefix = '.'.join([checkpoint_prefix, 'observation_filters', observation_name, + curr_obs_filter_ckpt_prefix = '.'.join([checkpoint_prefix, 'observation_filters', observation_name, filter_name]) - filter.restore_state_from_checkpoint(checkpoint_dir, checkpoint_prefix) + filter.restore_state_from_checkpoint(checkpoint_dir, curr_obs_filter_ckpt_prefix) class NoInputFilter(InputFilter): diff --git a/rl_coach/utilities/shared_running_stats.py b/rl_coach/utilities/shared_running_stats.py index 7f1176f..b78b66b 100644 --- a/rl_coach/utilities/shared_running_stats.py +++ b/rl_coach/utilities/shared_running_stats.py @@ -109,13 +109,13 @@ class SharedRunningStats(ABC): def restore_state_from_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str): pass - def get_latest_checkpoint(self, checkpoint_dir: str) -> str: + def get_latest_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str) -> str: latest_checkpoint_id = -1 latest_checkpoint = '' # get all checkpoint files for fname in os.listdir(checkpoint_dir): path = os.path.join(checkpoint_dir, fname) - if os.path.isdir(path) or fname.split('.')[-1] != 'srs': + if os.path.isdir(path) or fname.split('.')[-1] != 'srs' or checkpoint_prefix not in fname: continue checkpoint_id = int(fname.split('_')[0]) if checkpoint_id > latest_checkpoint_id: @@ -189,7 +189,7 @@ class NumpySharedRunningStats(SharedRunningStats): pickle.dump(dict_to_save, f, pickle.HIGHEST_PROTOCOL) def restore_state_from_checkpoint(self, checkpoint_dir: str, checkpoint_prefix: str): - latest_checkpoint_filename = self.get_latest_checkpoint(checkpoint_dir) + latest_checkpoint_filename = self.get_latest_checkpoint(checkpoint_dir, checkpoint_prefix) if latest_checkpoint_filename == '': raise ValueError("Could not find NumpySharedRunningStats checkpoint file. ") From 8e3ee818f865388f5ba7abd2cfe89a57b70419d7 Mon Sep 17 00:00:00 2001 From: Zach Dwiel Date: Fri, 21 Dec 2018 10:10:31 -0500 Subject: [PATCH 4/4] update circle ci config to match new golden test presets (#167) --- .circleci/config.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 96d51b6..2e1cdf1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -175,8 +175,8 @@ jobs: - run: name: run gym related golden tests command: | - export PRESETS='CartPole_A3C,CartPole_Dueling_DDQN,CartPole_NStepQ,CartPole_DQN,CartPole_DFP,CartPole_PG,CartPole_NEC,CartPole_ClippedPPO,CartPole_PAL' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-gym -tc "export PRESETS=${PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-gym_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export GOLDEN_PRESETS='CartPole' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-gym -tc "export GOLDEN_PRESETS=${GOLDEN_PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-gym_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -196,8 +196,8 @@ jobs: - run: name: run doom related golden tests command: | - export PRESETS='Doom_Basic_DQN,Doom_Basic_A3C,Doom_Health_DFP' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-doom -tc "export PRESETS=${PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-doom_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export GOLDEN_PRESETS='Doom' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-doom -tc "export GOLDEN_PRESETS=${GOLDEN_PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-doom_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -217,8 +217,8 @@ jobs: - run: name: run mujoco related golden tests command: | - export PRESETS='BitFlip_DQN_HER,BitFlip_DQN,Mujoco_A3C,Mujoco_A3C_LSTM,Mujoco_PPO,Mujoco_ClippedPPO,Mujoco_DDPG' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-mujoco -tc "export PRESETS=${PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-mujoco_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export GOLDEN_PRESETS='BitFlip or Mujoco' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn golden-test-mujoco -tc "export GOLDEN_PRESETS=${GOLDEN_PRESETS} && make golden_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-mujoco_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -238,8 +238,8 @@ jobs: - run: name: run gym related trace tests command: | - export PRESETS='CartPole_A3C,CartPole_Dueling_DDQN,CartPole_NStepQ,CartPole_DQN,CartPole_DFP,CartPole_PG,CartPole_NEC,CartPole_ClippedPPO,CartPole_PAL' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-gym -tc "export PRESETS=${PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-gym_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export TRACE_PRESETS='CartPole_A3C,CartPole_Dueling_DDQN,CartPole_NStepQ,CartPole_DQN,CartPole_DFP,CartPole_PG,CartPole_NEC,CartPole_ClippedPPO,CartPole_PAL' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-gym -tc "export TRACE_PRESETS=${TRACE_PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-gym_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -259,8 +259,8 @@ jobs: - run: name: run doom related trace tests command: | - export PRESETS='Doom_Basic_DQN,Doom_Basic_A3C,Doom_Health_DFP' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-doom -tc "export PRESETS=${PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-doom_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export TRACE_PRESETS='Doom_Basic_DQN,Doom_Basic_A3C,Doom_Health_DFP' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-doom -tc "export TRACE_PRESETS=${TRACE_PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-doom_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -280,8 +280,8 @@ jobs: - run: name: run mujoco related trace tests command: | - export PRESETS='BitFlip_DQN_HER,BitFlip_DQN,Mujoco_A3C,Mujoco_A3C_LSTM,Mujoco_PPO,Mujoco_ClippedPPO,Mujoco_DDPG' - python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-mujoco -tc "export PRESETS=${PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-mujoco_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 + export TRACE_PRESETS='BitFlip_DQN_HER,BitFlip_DQN,Mujoco_A3C,Mujoco_A3C_LSTM,Mujoco_PPO,Mujoco_ClippedPPO,Mujoco_DDPG' + python3 rl_coach/tests/test_eks.py -c coach-test -bn ${CIRCLE_BUILD_NUM} -tn trace-test-mujoco -tc "export TRACE_PRESETS=${TRACE_PRESETS} && make trace_tests_without_docker" -i 316971102342.dkr.ecr.us-west-2.amazonaws.com/coach-mujoco_environment:$(git describe --tags --always --dirty) -cpu 2048 -mem 4096 no_output_timeout: 30m - run: name: cleanup @@ -315,7 +315,7 @@ jobs: docker pull ${REGISTRY}/coach:${TAG} docker tag ${REGISTRY}/coach:${TAG} ${REGISTRY}/coach:${MASTER_BRANCH} docker push ${REGISTRY}/coach:${MASTER_BRANCH} - + workflows: version: 2