mirror of
https://github.com/gryf/coach.git
synced 2026-02-26 04:05:49 +01:00
pre-release 0.10.0
This commit is contained in:
53
rl_coach/presets/Atari_A3C.py
Normal file
53
rl_coach/presets/Atari_A3C.py
Normal file
@@ -0,0 +1,53 @@
"""Preset: A3C (Asynchronous Advantage Actor-Critic) on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection, SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.categorical import CategoricalParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
# Effectively unbounded training; run is stopped externally.
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()

agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.beta_entropy = 0.05

agent_params.network_wrappers['main'].middleware_parameters = FCMiddlewareParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001

agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
55
rl_coach/presets/Atari_A3C_LSTM.py
Normal file
55
rl_coach/presets/Atari_A3C_LSTM.py
Normal file
@@ -0,0 +1,55 @@
"""Preset: A3C with an LSTM middleware on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.architectures.tensorflow_components.middlewares.lstm_middleware import LSTMMiddlewareParameters
from rl_coach.base_parameters import VisualizationParameters, MiddlewareScheme, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection, SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, AtariInputFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.categorical import CategoricalParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
# Effectively unbounded training; run is stopped externally.
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(10000)

#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()

agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.beta_entropy = 0.05

agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].middleware_parameters = LSTMMiddlewareParameters(scheme=MiddlewareScheme.Medium,
                                                                                       number_of_lstm_cells=256)
# Frame stacking is removed since the LSTM carries temporal context instead.
agent_params.input_filter = AtariInputFilter()
agent_params.input_filter.remove_observation_filter('observation', 'stacking')
agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = True

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
44
rl_coach/presets/Atari_Bootstrapped_DQN.py
Normal file
44
rl_coach/presets/Atari_Bootstrapped_DQN.py
Normal file
@@ -0,0 +1,44 @@
"""Preset: Bootstrapped DQN on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = BootstrappedDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
43
rl_coach/presets/Atari_C51.py
Normal file
43
rl_coach/presets/Atari_C51.py
Normal file
@@ -0,0 +1,43 @@
"""Preset: Categorical DQN (C51) on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = CategoricalDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
43
rl_coach/presets/Atari_DDQN.py
Normal file
43
rl_coach/presets/Atari_DDQN.py
Normal file
@@ -0,0 +1,43 @@
"""Preset: Double DQN (DDQN) on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
48
rl_coach/presets/Atari_DDQN_with_PER.py
Normal file
48
rl_coach/presets/Atari_DDQN_with_PER.py
Normal file
@@ -0,0 +1,48 @@
"""Preset: Double DQN with Prioritized Experience Replay on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
# Learning rate is reduced by 4x relative to the plain DDQN preset.
agent_params.network_wrappers['main'].learning_rate = 0.00025/4
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
43
rl_coach/presets/Atari_DQN.py
Normal file
43
rl_coach/presets/Atari_DQN.py
Normal file
@@ -0,0 +1,43 @@
"""Preset: DQN on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
47
rl_coach/presets/Atari_DQN_with_PER.py
Normal file
47
rl_coach/presets/Atari_DQN_with_PER.py
Normal file
@@ -0,0 +1,47 @@
"""Preset: DQN with Prioritized Experience Replay on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
50
rl_coach/presets/Atari_Dueling_DDQN.py
Normal file
50
rl_coach/presets/Atari_Dueling_DDQN.py
Normal file
@@ -0,0 +1,50 @@
"""Preset: Dueling-architecture Double DQN on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
import math

from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import VisualizationParameters, MiddlewareScheme, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
# Empty middleware: the dueling head replaces the usual fully-connected stack.
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Empty
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.network_wrappers['main'].rescale_gradient_from_head_by_factor = [1/math.sqrt(2)]
agent_params.network_wrappers['main'].clip_gradients = 10

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
57
rl_coach/presets/Atari_Dueling_DDQN_with_PER_OpenAI.py
Normal file
57
rl_coach/presets/Atari_Dueling_DDQN_with_PER_OpenAI.py
Normal file
@@ -0,0 +1,57 @@
"""Preset: Dueling DDQN with PER, OpenAI-baselines-style hyperparameters, on Atari.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import VisualizationParameters, MiddlewareScheme, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters
from rl_coach.schedules import LinearSchedule, PieceWiseSchedule, ConstantSchedule

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
# Empty middleware: the dueling head replaces the usual fully-connected stack.
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Empty
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.network_wrappers['main'].clip_gradients = 10
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(40000)
# NOTE(review): the second segment decays epsilon over 10M steps but is only
# allotted 1M environment steps — looks inconsistent; confirm against upstream.
agent_params.exploration.epsilon_schedule = PieceWiseSchedule(
    [(LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
     (LinearSchedule(0.1, 0.01, 10000000), EnvironmentSteps(1000000)),
     (ConstantSchedule(0.001), EnvironmentSteps(10000000))]
)
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
47
rl_coach/presets/Atari_NEC.py
Normal file
47
rl_coach/presets/Atari_NEC.py
Normal file
@@ -0,0 +1,47 @@
"""Preset: Neural Episodic Control (NEC) on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection, SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Atari, AtariInputFilter, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.nec_agent import NECAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(2000)

#########
# Agent #
#########
agent_params = NECAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.00001
# NEC uses unclipped rewards, so drop the default Atari reward-clipping filter.
agent_params.input_filter = AtariInputFilter()
agent_params.input_filter.remove_reward_filter('clipping')

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)
env_params.random_initialization_steps = 1

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
48
rl_coach/presets/Atari_NStepQ.py
Normal file
48
rl_coach/presets/Atari_NStepQ.py
Normal file
@@ -0,0 +1,48 @@
"""Preset: N-step Q-learning on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.architectures.tensorflow_components.architecture import Conv2d, Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection, SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.n_step_q_agent import NStepQAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
# Effectively unbounded training; run is stopped externally.
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = NStepQAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Conv2d([16, 8, 4]),
                                                                                          Conv2d([32, 4, 2])]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([256])]

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
44
rl_coach/presets/Atari_QR_DQN.py
Normal file
44
rl_coach/presets/Atari_QR_DQN.py
Normal file
@@ -0,0 +1,44 @@
"""Preset: Quantile Regression DQN (QR-DQN) on the Atari suite.

Reconstructed from a diff-scraped source: the original span was interleaved
with diff-gutter artifacts; all code statements are preserved verbatim.
"""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.qr_dqn_agent import QuantileRegressionDQNAgentParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = QuantileRegressionDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00005  # called alpha in the paper
agent_params.algorithm.huber_loss_interval = 1  # k = 0 for strict quantile loss, k = 1 for Huber quantile loss

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
45
rl_coach/presets/Atari_UCB_with_Q_Ensembles.py
Normal file
45
rl_coach/presets/Atari_UCB_with_Q_Ensembles.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Preset: Atari with Bootstrapped DQN and UCB exploration over the Q-ensemble."""
from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.ucb import UCBParameters

# ---------------------------------------------------------------------------
# Graph scheduling: standard Atari schedule (50M env steps, periodic evals).
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

# ---------------------------------------------------------------------------
# Agent: bootstrapped Q-ensemble, action selection via UCB over the heads.
# ---------------------------------------------------------------------------
agent_params = BootstrappedDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.exploration = UCBParameters()

# ---------------------------------------------------------------------------
# Environment: any Atari level, selected on the command line.
# ---------------------------------------------------------------------------
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

# Dump a video only for evaluation episodes and new-best episodes.
vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation (trace tests on a few representative levels).
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'alien']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
68
rl_coach/presets/BitFlip_DQN.py
Normal file
68
rl_coach/presets/BitFlip_DQN.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""Preset: DQN on the BitFlip toy problem (no hindsight replay)."""
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, InputEmbedderParameters, \
    PresetValidationParameters
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps

# Number of bits in the BitFlip state/goal vectors.
bit_length = 8

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(400000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DQNAgentParameters()
main_net = agent_params.network_wrappers['main']
main_net.learning_rate = 0.001
main_net.batch_size = 128
main_net.middleware_parameters.scheme = [Dense([256])]
# Both the state and the desired goal are fed in raw (no embedding layers).
main_net.input_embedders_parameters = {
    name: InputEmbedderParameters(scheme=EmbedderScheme.Empty)
    for name in ('state', 'desired_goal')
}
agent_params.algorithm.discount = 0.98
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16)
agent_params.algorithm.num_consecutive_training_steps = 40
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
agent_params.exploration.evaluation_epsilon = 0

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = Mujoco()
env_params.level = 'rl_coach.environments.toy_problems.bit_flip:BitFlip'
env_params.additional_simulator_parameters = {'bit_length': bit_length, 'mean_zero': True}
# env_params.custom_reward_threshold = -bit_length + 1

vis_params = VisualizationParameters()

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = -7.9
preset_validation_params.max_episodes_to_achieve_reward = 10000

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)


# self.algorithm.add_intrinsic_reward_for_reaching_the_goal = False
|
||||
|
||||
81
rl_coach/presets/BitFlip_DQN_HER.py
Normal file
81
rl_coach/presets/BitFlip_DQN_HER.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Preset: DQN + Hindsight Experience Replay on the BitFlip toy problem."""
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, InputEmbedderParameters, \
    PresetValidationParameters
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import \
    EpisodicHindsightExperienceReplayParameters, HindsightGoalSelectionMethod
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import GoalsSpace, ReachingGoal

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps

# Number of bits in the BitFlip state/goal vectors.
bit_length = 20

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(16 * 50 * 200)  # 200 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([256])]
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
agent_params.algorithm.discount = 0.98
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16)
agent_params.algorithm.num_consecutive_training_steps = 40
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
agent_params.exploration.evaluation_epsilon = 0

# Swap in the hindsight replay buffer *before* configuring memory attributes.
# BUGFIX: the original preset set `agent_params.memory.max_size` on the default
# DQN replay buffer and only afterwards replaced `agent_params.memory` with the
# HER parameters, so the max-size setting was a dead store and silently lost.
agent_params.memory = EpisodicHindsightExperienceReplayParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Final
agent_params.memory.hindsight_transitions_per_regular_transition = 1
# Goal space: the goal is the full state; reward is -1 per step until the goal
# is reached exactly (distance threshold 0), then 0.
agent_params.memory.goals_space = GoalsSpace(goal_name='state',
                                             reward_type=ReachingGoal(distance_from_goal_threshold=0,
                                                                      goal_reaching_reward=0,
                                                                      default_reward=-1),
                                             distance_metric=GoalsSpace.DistanceMetric.Euclidean)

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = 'rl_coach.environments.toy_problems.bit_flip:BitFlip'
env_params.additional_simulator_parameters = {'bit_length': bit_length, 'mean_zero': True}
env_params.custom_reward_threshold = -bit_length + 1

vis_params = VisualizationParameters()

# currently no tests for this preset as the max reward can be accidently achieved. will be fixed with trace based tests.

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = -15
preset_validation_params.max_episodes_to_achieve_reward = 10000

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)


# self.algorithm.add_intrinsic_reward_for_reaching_the_goal = False
|
||||
|
||||
61
rl_coach/presets/Carla_3_Cameras_DDPG.py
Normal file
61
rl_coach/presets/Carla_3_Cameras_DDPG.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Preset: DDPG on CARLA town1 with three cameras (front, left, right)."""
import copy

from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.carla_environment import CarlaEnvironmentParameters, CameraTypes, CarlaInputFilter
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DDPGAgentParameters()
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)

# Rename the default 'observation' embedder to 'forward_camera' on both
# networks, then clone it for the left and right cameras.
for net_name in ('actor', 'critic'):
    embedders = agent_params.network_wrappers[net_name].input_embedders_parameters
    embedders['forward_camera'] = embedders.pop('observation')
    for side_camera in ('left_camera', 'right_camera'):
        embedders[side_camera] = copy.deepcopy(embedders['forward_camera'])

# Apply the same observation filters to every camera stream.
agent_params.input_filter = CarlaInputFilter()
for side_camera in ('left_camera', 'right_camera'):
    agent_params.input_filter.copy_filters_from_one_observation_to_another('forward_camera', side_camera)

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'
env_params.cameras = [CameraTypes.FRONT, CameraTypes.LEFT, CameraTypes.RIGHT]

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
40
rl_coach/presets/Carla_DDPG.py
Normal file
40
rl_coach/presets/Carla_DDPG.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""Preset: DDPG on CARLA town1 with a single front camera."""
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.carla_environment import CarlaEnvironmentParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DDPGAgentParameters()
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
# Rename the default 'observation' embedder to 'forward_camera' on both the
# actor and the critic so it matches the CARLA observation name.
for net_name in ('actor', 'critic'):
    embedders = agent_params.network_wrappers[net_name].input_embedders_parameters
    embedders['forward_camera'] = embedders.pop('observation')

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
51
rl_coach/presets/Carla_Dueling_DDQN.py
Normal file
51
rl_coach/presets/Carla_Dueling_DDQN.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""Preset: Dueling DDQN on CARLA town1 with a discretized action space."""
import math

from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import VisualizationParameters, MiddlewareScheme
from rl_coach.environments.carla_environment import CarlaEnvironmentParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
from rl_coach.filters.action.box_discretization import BoxDiscretization
from rl_coach.filters.filter import OutputFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DDQNAgentParameters()
main_net = agent_params.network_wrappers['main']
main_net.learning_rate = 0.00025
main_net.heads_parameters = [DuelingQHeadParameters()]
main_net.middleware_parameters.scheme = MiddlewareScheme.Empty
main_net.rescale_gradient_from_head_by_factor = [1 / math.sqrt(2), 1 / math.sqrt(2)]
main_net.clip_gradients = 10
# Rename the default 'observation' embedder to match CARLA's camera name.
main_net.input_embedders_parameters['forward_camera'] = \
    main_net.input_embedders_parameters.pop('observation')
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
# Discretize CARLA's continuous action space so a Q-learning agent can act.
agent_params.output_filter = OutputFilter()
agent_params.output_filter.add_action_filter('discretization', BoxDiscretization(5))

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = CarlaEnvironmentParameters()
env_params.level = 'town1'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
63
rl_coach/presets/CartPole_A3C.py
Normal file
63
rl_coach/presets/CartPole_A3C.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""Preset: A3C (actor-critic with GAE) on CartPole-v0, 8 workers."""
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import MujocoInputFilter, Mujoco
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = ActorCriticAgentParameters()

# Policy-gradient settings: GAE advantages with lambda=1, small entropy bonus.
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
agent_params.algorithm.discount = 0.99
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 5
agent_params.algorithm.gae_lambda = 1
agent_params.algorithm.beta_entropy = 0.01

agent_params.network_wrappers['main'].optimizer_type = 'Adam'
agent_params.network_wrappers['main'].learning_rate = 0.0001

# Rescale rewards into [0, 1] (CartPole-v0 episodes are capped at 200 reward).
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))

# Sample actions from the policy's categorical distribution.
agent_params.exploration = CategoricalParameters()

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 300
preset_validation_params.num_workers = 8

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
58
rl_coach/presets/CartPole_DFP.py
Normal file
58
rl_coach/presets/CartPole_DFP.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Preset: Direct Future Prediction (DFP) on CartPole-v0."""
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.dfp_agent import DFPAgentParameters, HandlingTargetsAfterEpisodeEnd
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(100)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DFPAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
# All three DFP input streams use the medium embedder scheme.
for embedder_name in ('observation', 'goal', 'measurements'):
    agent_params.network_wrappers['main'].input_embedders_parameters[embedder_name].scheme = EmbedderScheme.Medium
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0.01, 3000)
agent_params.exploration.evaluation_epsilon = 0.01
agent_params.algorithm.discount = 1.0
agent_params.algorithm.use_accumulated_reward_as_measurement = True
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
agent_params.algorithm.goal_vector = [1]  # accumulated_reward
agent_params.algorithm.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.LastStep

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
62
rl_coach/presets/CartPole_DQN.py
Normal file
62
rl_coach/presets/CartPole_DQN.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Preset: vanilla DQN on CartPole-v0."""
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DQNAgentParameters()

# DQN algorithm settings.
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
agent_params.algorithm.discount = 0.99
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

# Network settings.
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# Experience replay size.
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# Epsilon-greedy schedule.
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
67
rl_coach/presets/CartPole_Dueling_DDQN.py
Normal file
67
rl_coach/presets/CartPole_Dueling_DDQN.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Preset: Dueling Double-DQN on CartPole-v0."""
import math

from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = DDQNAgentParameters()

# DDQN algorithm settings.
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
agent_params.algorithm.discount = 0.99
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

# Network settings: dueling head with gradient rescaling on both streams.
main_net = agent_params.network_wrappers['main']
main_net.learning_rate = 0.00025
main_net.replace_mse_with_huber_loss = False
main_net.heads_parameters = [DuelingQHeadParameters()]
main_net.rescale_gradient_from_head_by_factor = [1 / math.sqrt(2), 1 / math.sqrt(2)]

# Experience replay size.
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# Epsilon-greedy schedule.
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
57
rl_coach/presets/CartPole_NEC.py
Normal file
57
rl_coach/presets/CartPole_NEC.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Preset: Neural Episodic Control (NEC) on CartPole-v0."""
from rl_coach.agents.nec_agent import NECAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Atari, MujocoInputFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

# ---------------------------------------------------------------------------
# Graph scheduling
# ---------------------------------------------------------------------------
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1300)

# ---------------------------------------------------------------------------
# Agent
# ---------------------------------------------------------------------------
agent_params = NECAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0.1, 1000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.discount = 0.99
agent_params.memory.max_size = (MemoryGranularity.Episodes, 200)
# Rescale rewards into [0, 1] (CartPole-v0 episodes are capped at 200 reward).
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))

# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------
# NOTE(review): the Atari parameters class is used here even though the level
# is CartPole-v0 (other CartPole presets use Mujoco) — presumably deliberate
# to get the Atari-style frame handling; confirm before changing.
env_params = Atari()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

# ---------------------------------------------------------------------------
# Preset validation
# ---------------------------------------------------------------------------
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 300

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
52
rl_coach/presets/CartPole_NStepQ.py
Normal file
52
rl_coach/presets/CartPole_NStepQ.py
Normal file
@@ -0,0 +1,52 @@
|
||||
# rl_coach preset: N-Step Q-learning agent on Gym CartPole-v0.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.n_step_q_agent import NStepQAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import MujocoInputFilter, Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = NStepQAgentParameters()

agent_params.algorithm.discount = 0.99
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
agent_params.input_filter = MujocoInputFilter()
# CartPole-v0 caps episodes at 200 steps, so rewards are rescaled into [0, 1]
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 200
preset_validation_params.num_workers = 8

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
63
rl_coach/presets/CartPole_PAL.py
Normal file
63
rl_coach/presets/CartPole_PAL.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# rl_coach preset: PAL (Persistent Advantage Learning) agent on Gym CartPole-v0.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.pal_agent import PALAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import Mujoco
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = PALAgentParameters()

# DQN params
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
agent_params.algorithm.discount = 0.99
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER size
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

################
# Environment #
################
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
59
rl_coach/presets/CartPole_PG.py
Normal file
59
rl_coach/presets/CartPole_PG.py
Normal file
@@ -0,0 +1,59 @@
|
||||
# rl_coach preset: vanilla Policy Gradients agent on Gym CartPole-v0.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.policy_gradients_agent import PolicyGradientsAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import MujocoInputFilter, Mujoco
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PolicyGradientsAgentParameters()

agent_params.algorithm.discount = 0.99
agent_params.algorithm.apply_gradients_every_x_episodes = 5
agent_params.algorithm.num_steps_between_gradient_updates = 20000

agent_params.network_wrappers['main'].optimizer_type = 'Adam'
agent_params.network_wrappers['main'].learning_rate = 0.0005

agent_params.input_filter = MujocoInputFilter()
# CartPole-v0 caps episodes at 200 steps, so rewards are rescaled into [0, 1]
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))

# discrete action space -> sample actions from the policy's categorical head
agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = 'CartPole-v0'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 130
preset_validation_params.max_episodes_to_achieve_reward = 550

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
|
||||
63
rl_coach/presets/ControlSuite_DDPG.py
Normal file
63
rl_coach/presets/ControlSuite_DDPG.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# rl_coach preset: DDPG agent on the DeepMind Control Suite (level chosen at run time).
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, PresetValidationParameters
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironmentParameters, control_suite_envs
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import MujocoInputFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
# Control Suite exposes low-dimensional state under the 'measurements' key,
# so the default 'observation' embedder is renamed for both networks.
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'].scheme = [Dense([300])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([200])]
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'].scheme = [Dense([400])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1/10.))

###############
# Environment #
###############
env_params = ControlSuiteEnvironmentParameters()
env_params.level = SingleLevelSelection(control_suite_envs)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False


########
# Test #
########
# this preset is too slow to test on a regular basis

# preset_validation_params = PresetValidationParameters()
# preset_validation_params.test = True
# preset_validation_params.min_reward_threshold = 150
# preset_validation_params.max_episodes_to_achieve_reward = 250


graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,)
                                    # preset_validation_params=preset_validation_params)
|
||||
64
rl_coach/presets/Doom_Basic_A3C.py
Normal file
64
rl_coach/presets/Doom_Basic_A3C.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# rl_coach preset: A3C (actor-critic with GAE) agent on the ViZDoom 'basic' scenario.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.environments.gym_environment import MujocoInputFilter
from rl_coach.exploration_policies.categorical import CategoricalParameters
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.doom_environment import DoomEnvironmentParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)


#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/100.))
agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
# gae_lambda = 1.0 reduces GAE to plain Monte-Carlo advantage estimation
agent_params.algorithm.gae_lambda = 1.0
agent_params.algorithm.beta_entropy = 0.01
agent_params.network_wrappers['main'].clip_gradients = 40.
agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400


graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
43
rl_coach/presets/Doom_Basic_BC.py
Normal file
43
rl_coach/presets/Doom_Basic_BC.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# rl_coach preset: Behavioral Cloning agent on the ViZDoom 'basic' scenario,
# trained from a recorded demonstration dataset.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.bc_agent import BCAgentParameters
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = TrainingSteps(500)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)


#########
# Agent #
#########
agent_params = BCAgentParameters()
# agent_params.memory.max_size = (MemoryGranularity.Episodes, 1000)
agent_params.network_wrappers['main'].learning_rate = 0.0005
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 50000)
agent_params.exploration.evaluation_epsilon = 0
# num_consecutive_playing_steps = 0: purely offline training from the dataset
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].batch_size = 120
agent_params.memory.load_memory_from_file_path = 'datasets/doom_basic.p'


###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters())
|
||||
51
rl_coach/presets/Doom_Basic_DFP.py
Normal file
51
rl_coach/presets/Doom_Basic_DFP.py
Normal file
@@ -0,0 +1,51 @@
|
||||
# rl_coach preset: DFP (Direct Future Prediction) agent on the ViZDoom 'basic' scenario.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.dfp_agent import DFPAgentParameters, HandlingTargetsAfterEpisodeEnd
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.doom_environment import DoomEnvironmentParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)

# There is no heatup for DFP. heatup length is determined according to batch size. See below.

#########
# Agent #
#########
agent_params = DFPAgentParameters()
schedule_params.heatup_steps = EnvironmentSteps(agent_params.network_wrappers['main'].batch_size)

agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# this works better than the default which is 64
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

agent_params.algorithm.use_accumulated_reward_as_measurement = True
agent_params.algorithm.goal_vector = [0, 1]  # ammo, accumulated_reward
agent_params.algorithm.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.LastStep


###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
57
rl_coach/presets/Doom_Basic_DQN.py
Normal file
57
rl_coach/presets/Doom_Basic_DQN.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# rl_coach preset: DQN agent on the ViZDoom 'basic' scenario.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.doom_environment import DoomEnvironmentParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)


#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 5000)
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 50000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False


###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400


graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
|
||||
48
rl_coach/presets/Doom_Basic_Dueling_DDQN.py
Normal file
48
rl_coach/presets/Doom_Basic_Dueling_DDQN.py
Normal file
@@ -0,0 +1,48 @@
|
||||
# rl_coach preset: Dueling Double-DQN agent on the ViZDoom 'basic' scenario.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.doom_environment import DoomEnvironmentParameters

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(1000)


#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 5000)
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
# swap the default Q head for a dueling (value + advantage) head
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]

###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
57
rl_coach/presets/Doom_Battle_DFP.py
Normal file
57
rl_coach/presets/Doom_Battle_DFP.py
Normal file
@@ -0,0 +1,57 @@
|
||||
# rl_coach preset: DFP (Direct Future Prediction) agent on the ViZDoom 'battle' scenario.
# Fix: removed the diff-view separator junk that was interleaved between every
# statement and made the file syntactically invalid.
from rl_coach.agents.dfp_agent import DFPAgentParameters
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule

from rl_coach.core_types import TrainingSteps, EnvironmentSteps, RunPhase
from rl_coach.environments.doom_environment import DoomEnvironmentParameters, DoomEnvironment

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(6250000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(62500)
schedule_params.evaluation_steps = EnvironmentSteps(6250)
schedule_params.heatup_steps = EnvironmentSteps(1)

#########
# Agent #
#########
agent_params = DFPAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0001
# the original DFP code decays epsilon in ~1.5M steps. Only that unlike other most other papers, these are 1.5M
# training steps. i.e. it is equivalent to once every 8 playing steps (when a training batch is sampled).
# so this is 1.5M*8 =~ 12M playing steps per worker.
# TODO allow the epsilon schedule to be defined in terms of training steps.
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0, 12000000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.use_accumulated_reward_as_measurement = False
agent_params.algorithm.goal_vector = [0.5, 0.5, 1]  # ammo, health, frag count
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_rescaling['vector'] = 100.
agent_params.algorithm.scale_measurements_targets['GameVariable.HEALTH'] = 30.0
agent_params.algorithm.scale_measurements_targets['GameVariable.AMMO2'] = 7.5
agent_params.algorithm.scale_measurements_targets['GameVariable.USER2'] = 1.0
agent_params.network_wrappers['main'].learning_rate_decay_rate = 0.3
agent_params.network_wrappers['main'].learning_rate_decay_steps = 250000
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_offset['vector'] = 0.5
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].input_offset['vector'] = 0.5


###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'BATTLE_COACH_LOCAL'
env_params.cameras = [DoomEnvironment.CameraTypes.OBSERVATION]

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
68
rl_coach/presets/Doom_Health_DFP.py
Normal file
68
rl_coach/presets/Doom_Health_DFP.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from rl_coach.agents.dfp_agent import DFPAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, MiddlewareScheme, \
|
||||
PresetValidationParameters
|
||||
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentSteps(6250000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(62500)
|
||||
schedule_params.evaluation_steps = EnvironmentSteps(6250)
|
||||
|
||||
# There is no heatup for DFP. heatup length is determined according to batch size. See below.
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = DFPAgentParameters()
|
||||
schedule_params.heatup_steps = EnvironmentSteps(agent_params.network_wrappers['main'].batch_size)
|
||||
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.0001
|
||||
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0, 10000)
|
||||
agent_params.exploration.evaluation_epsilon = 0
|
||||
agent_params.algorithm.goal_vector = [1] # health
|
||||
|
||||
|
||||
# scale observation and measurements to be -0.5 <-> 0.5
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_rescaling['vector'] = 100.
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_offset['vector'] = 0.5
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].input_offset['vector'] = 0.5
|
||||
|
||||
# changing the network scheme to match Coach's default network, as it performs better on this preset
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['goal'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Medium
|
||||
|
||||
# scale the target measurements according to the paper (dividing by standard deviation)
|
||||
agent_params.algorithm.scale_measurements_targets['GameVariable.HEALTH'] = 30.0
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = DoomEnvironmentParameters()
|
||||
env_params.level = 'HEALTH_GATHERING'
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 1600
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 70
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
59
rl_coach/presets/Doom_Health_MMC.py
Normal file
59
rl_coach/presets/Doom_Health_MMC.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from rl_coach.agents.dfp_agent import DFPAgentParameters
|
||||
from rl_coach.agents.mmc_agent import MixedMonteCarloAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, MiddlewareScheme, \
|
||||
PresetValidationParameters
|
||||
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(5)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(1000)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = MixedMonteCarloAgentParameters()
|
||||
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00025
|
||||
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0, 10000)
|
||||
agent_params.exploration.evaluation_epsilon = 0
|
||||
agent_params.memory.max_size = (MemoryGranularity.Episodes, 200)
|
||||
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
|
||||
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
|
||||
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = DoomEnvironmentParameters()
|
||||
env_params.level = 'HEALTH_GATHERING'
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
|
||||
# disabling this test for now, as it takes too long to converge
|
||||
# preset_validation_params.test = True
|
||||
# preset_validation_params.min_reward_threshold = 1000
|
||||
# preset_validation_params.max_episodes_to_achieve_reward = 300
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
67
rl_coach/presets/Doom_Health_Supreme_DFP.py
Normal file
67
rl_coach/presets/Doom_Health_Supreme_DFP.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from rl_coach.agents.dfp_agent import DFPAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, MiddlewareScheme, \
|
||||
PresetValidationParameters
|
||||
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentSteps(6250000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(62500)
|
||||
schedule_params.evaluation_steps = EnvironmentSteps(6250)
|
||||
|
||||
# There is no heatup for DFP. heatup length is determined according to batch size. See below.
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = DFPAgentParameters()
|
||||
schedule_params.heatup_steps = EnvironmentSteps(agent_params.network_wrappers['main'].batch_size)
|
||||
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.0001
|
||||
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0, 10000)
|
||||
agent_params.exploration.evaluation_epsilon = 0
|
||||
agent_params.algorithm.goal_vector = [1] # health
|
||||
|
||||
# scale observation and measurements to be -0.5 <-> 0.5
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_rescaling['vector'] = 100.
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_offset['vector'] = 0.5
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].input_offset['vector'] = 0.5
|
||||
|
||||
# changing the network scheme to match Coach's default network, as it performs better on this preset
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['goal'].scheme = EmbedderScheme.Medium
|
||||
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Medium
|
||||
|
||||
# scale the target measurements according to the paper (dividing by standard deviation)
|
||||
agent_params.algorithm.scale_measurements_targets['GameVariable.HEALTH'] = 30.0
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = DoomEnvironmentParameters()
|
||||
env_params.level = 'HEALTH_GATHERING_SUPREME_COACH_LOCAL'
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 1600
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 70
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
66
rl_coach/presets/ExplorationChain_Bootstrapped_DQN.py
Normal file
66
rl_coach/presets/ExplorationChain_Bootstrapped_DQN.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.gym_environment import Mujoco
|
||||
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
|
||||
from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters
|
||||
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
|
||||
|
||||
N = 20
|
||||
num_output_head_copies = 20
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentEpisodes(2000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(N)
|
||||
|
||||
####################
|
||||
# DQN Agent Params #
|
||||
####################
|
||||
agent_params = BootstrappedDQNAgentParameters()
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00025
|
||||
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
|
||||
agent_params.algorithm.discount = 0.99
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
agent_params.network_wrappers['main'].num_output_head_copies = num_output_head_copies
|
||||
agent_params.network_wrappers['main'].rescale_gradient_from_head_by_factor = [1.0/num_output_head_copies]*num_output_head_copies
|
||||
agent_params.exploration.bootstrapped_data_sharing_probability = 1.0
|
||||
agent_params.exploration.architecture_num_q_heads = num_output_head_copies
|
||||
agent_params.exploration.epsilon_schedule = ConstantSchedule(0)
|
||||
agent_params.input_filter = NoInputFilter()
|
||||
agent_params.output_filter = NoOutputFilter()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = 'rl_coach.environments.toy_problems.exploration_chain:ExplorationChain'
|
||||
|
||||
env_params.additional_simulator_parameters = {'chain_length': N, 'max_steps': N+7}
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
|
||||
# currently no test here as bootstrapped_dqn seems to be broken
|
||||
|
||||
# preset_validation_params = PresetValidationParameters()
|
||||
# preset_validation_params.test = True
|
||||
# preset_validation_params.min_reward_threshold = 1600
|
||||
# preset_validation_params.max_episodes_to_achieve_reward = 70
|
||||
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,)
|
||||
# preset_validation_params=preset_validation_params)
|
||||
56
rl_coach/presets/ExplorationChain_Dueling_DDQN.py
Normal file
56
rl_coach/presets/ExplorationChain_Dueling_DDQN.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.gym_environment import GymEnvironmentParameters
|
||||
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.agents.ddqn_agent import DDQNAgentParameters
|
||||
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
|
||||
|
||||
N = 20
|
||||
num_output_head_copies = 20
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentEpisodes(2000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(N)
|
||||
|
||||
####################
|
||||
# DQN Agent Params #
|
||||
####################
|
||||
agent_params = DDQNAgentParameters()
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00025
|
||||
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
|
||||
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
|
||||
agent_params.algorithm.discount = 0.99
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0.1, (N+7)*2000)
|
||||
agent_params.input_filter = NoInputFilter()
|
||||
agent_params.output_filter = NoOutputFilter()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = GymEnvironmentParameters()
|
||||
env_params.level = 'rl_coach.environments.toy_problems.exploration_chain:ExplorationChain'
|
||||
env_params.additional_simulator_parameters = {'chain_length': N, 'max_steps': N+7}
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
|
||||
|
||||
# preset_validation_params = PresetValidationParameters()
|
||||
# preset_validation_params.test = True
|
||||
# preset_validation_params.min_reward_threshold = 1600
|
||||
# preset_validation_params.max_episodes_to_achieve_reward = 70
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,)
|
||||
# preset_validation_params=preset_validation_params)
|
||||
55
rl_coach/presets/ExplorationChain_UCB_Q_ensembles.py
Normal file
55
rl_coach/presets/ExplorationChain_UCB_Q_ensembles.py
Normal file
@@ -0,0 +1,55 @@
|
||||
from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.environments.gym_environment import Mujoco
|
||||
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
|
||||
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
|
||||
from rl_coach.exploration_policies.ucb import UCBParameters
|
||||
|
||||
N = 20
|
||||
num_output_head_copies = 20
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentEpisodes(2000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(N)
|
||||
|
||||
####################
|
||||
# DQN Agent Params #
|
||||
####################
|
||||
agent_params = BootstrappedDQNAgentParameters()
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00025
|
||||
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
|
||||
agent_params.algorithm.discount = 0.99
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
|
||||
agent_params.network_wrappers['main'].num_output_head_copies = num_output_head_copies
|
||||
agent_params.network_wrappers['main'].rescale_gradient_from_head_by_factor = [1.0/num_output_head_copies]*num_output_head_copies
|
||||
agent_params.exploration = UCBParameters()
|
||||
agent_params.exploration.bootstrapped_data_sharing_probability = 1.0
|
||||
agent_params.exploration.architecture_num_q_heads = num_output_head_copies
|
||||
agent_params.exploration.epsilon_schedule = ConstantSchedule(0)
|
||||
agent_params.exploration.lamb = 10
|
||||
agent_params.input_filter = NoInputFilter()
|
||||
agent_params.output_filter = NoOutputFilter()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = 'rl_coach.environments.toy_problems.exploration_chain:ExplorationChain'
|
||||
|
||||
env_params.additional_simulator_parameters = {'chain_length': N, 'max_steps': N+7}
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params)
|
||||
130
rl_coach/presets/Fetch_DDPG_HER_baselines.py
Normal file
130
rl_coach/presets/Fetch_DDPG_HER_baselines.py
Normal file
@@ -0,0 +1,130 @@
|
||||
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.fc_middleware import FCMiddlewareParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme, InputEmbedderParameters, PresetValidationParameters
|
||||
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod, MaxDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, MujocoInputFilter, fetch_v1
|
||||
from rl_coach.filters.observation.observation_clipping_filter import ObservationClippingFilter
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, \
|
||||
HindsightGoalSelectionMethod
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
from rl_coach.spaces import GoalsSpace, ReachingGoal
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
|
||||
cycles = 100 # 20 for reach. for others it's 100
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentEpisodes(cycles * 200) # 200 epochs
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(cycles) # 50 cycles
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
################
|
||||
# Agent Params #
|
||||
################
|
||||
agent_params = DDPGAgentParameters()
|
||||
|
||||
# actor
|
||||
actor_network = agent_params.network_wrappers['actor']
|
||||
actor_network.learning_rate = 0.001
|
||||
actor_network.batch_size = 256
|
||||
actor_network.optimizer_epsilon = 1e-08
|
||||
actor_network.adam_optimizer_beta1 = 0.9
|
||||
actor_network.adam_optimizer_beta2 = 0.999
|
||||
actor_network.input_embedders_parameters = {
|
||||
'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
|
||||
}
|
||||
actor_network.middleware_parameters = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256]), Dense([256])])
|
||||
actor_network.heads_parameters[0].batchnorm = False
|
||||
|
||||
# critic
|
||||
critic_network = agent_params.network_wrappers['critic']
|
||||
critic_network.learning_rate = 0.001
|
||||
critic_network.batch_size = 256
|
||||
critic_network.optimizer_epsilon = 1e-08
|
||||
critic_network.adam_optimizer_beta1 = 0.9
|
||||
critic_network.adam_optimizer_beta2 = 0.999
|
||||
critic_network.input_embedders_parameters = {
|
||||
'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
|
||||
}
|
||||
critic_network.middleware_parameters = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256]), Dense([256])])
|
||||
|
||||
agent_params.algorithm.discount = 0.98
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(1)
|
||||
agent_params.algorithm.num_consecutive_training_steps = 40
|
||||
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
|
||||
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
|
||||
agent_params.algorithm.action_penalty = 1
|
||||
agent_params.algorithm.use_non_zero_discount_for_terminal_states = True
|
||||
agent_params.algorithm.clip_critic_targets = [-50, 0]
|
||||
|
||||
# HER parameters
|
||||
agent_params.memory = EpisodicHindsightExperienceReplayParameters()
|
||||
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
|
||||
agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
|
||||
agent_params.memory.hindsight_transitions_per_regular_transition = 4
|
||||
agent_params.memory.goals_space = GoalsSpace(goal_name='achieved_goal',
|
||||
reward_type=ReachingGoal(distance_from_goal_threshold=0.05,
|
||||
goal_reaching_reward=0,
|
||||
default_reward=-1),
|
||||
distance_metric=GoalsSpace.DistanceMetric.Euclidean)
|
||||
agent_params.memory.shared_memory = True
|
||||
|
||||
# exploration parameters
|
||||
agent_params.exploration = EGreedyParameters()
|
||||
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.3)
|
||||
agent_params.exploration.evaluation_epsilon = 0
|
||||
# they actually take the noise_percentage_schedule to be 0.2 * max_abs_range which is 0.1 * total_range
|
||||
agent_params.exploration.continuous_exploration_policy_parameters.noise_percentage_schedule = ConstantSchedule(0.1)
|
||||
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise_percentage = 0
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.input_filter.add_observation_filter('observation', 'clipping', ObservationClippingFilter(-200, 200))
|
||||
|
||||
agent_params.pre_network_filter = MujocoInputFilter()
|
||||
agent_params.pre_network_filter.add_observation_filter('observation', 'normalize_observation',
|
||||
ObservationNormalizationFilter(name='normalize_observation'))
|
||||
agent_params.pre_network_filter.add_observation_filter('achieved_goal', 'normalize_achieved_goal',
|
||||
ObservationNormalizationFilter(name='normalize_achieved_goal'))
|
||||
agent_params.pre_network_filter.add_observation_filter('desired_goal', 'normalize_desired_goal',
|
||||
ObservationNormalizationFilter(name='normalize_desired_goal'))
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(fetch_v1)
|
||||
env_params.custom_reward_threshold = -49
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
# preset_validation_params.test = True
|
||||
# preset_validation_params.min_reward_threshold = 200
|
||||
# preset_validation_params.max_episodes_to_achieve_reward = 600
|
||||
# preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['slide', 'pick_and_place', 'push', 'reach']
|
||||
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
|
||||
47
rl_coach/presets/InvertedPendulum_PG.py
Normal file
47
rl_coach/presets/InvertedPendulum_PG.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from rl_coach.agents.policy_gradients_agent import PolicyGradientsAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
|
||||
from rl_coach.environments.gym_environment import Mujoco, MujocoInputFilter
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = PolicyGradientsAgentParameters()
|
||||
agent_params.algorithm.apply_gradients_every_x_episodes = 5
|
||||
agent_params.algorithm.num_steps_between_gradient_updates = 20000
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.0005
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/20.))
|
||||
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
|
||||
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = "InvertedPendulum-v2"
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params)
|
||||
|
||||
|
||||
44
rl_coach/presets/MontezumaRevenge_BC.py
Normal file
44
rl_coach/presets/MontezumaRevenge_BC.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from rl_coach.base_parameters import VisualizationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
|
||||
from rl_coach.environments.gym_environment import Atari
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
|
||||
from rl_coach.agents.bc_agent import BCAgentParameters
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = TrainingSteps(500)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = BCAgentParameters()
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00025
|
||||
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
|
||||
# agent_params.memory.discount = 0.99
|
||||
agent_params.algorithm.discount = 0.99
|
||||
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(0)
|
||||
agent_params.memory.load_memory_from_file_path = 'datasets/montezuma_revenge.p'
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Atari()
|
||||
env_params.level = 'MontezumaRevenge-v0'
|
||||
env_params.random_initialization_steps = 30
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params)
|
||||
62
rl_coach/presets/Mujoco_A3C.py
Normal file
62
rl_coach/presets/Mujoco_A3C.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(20000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = ActorCriticAgentParameters()
|
||||
agent_params.algorithm.apply_gradients_every_x_episodes = 1
|
||||
agent_params.algorithm.num_steps_between_gradient_updates = 10000000
|
||||
agent_params.algorithm.beta_entropy = 0.0001
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00001
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/20.))
|
||||
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
|
||||
|
||||
agent_params.exploration = ContinuousEntropyParameters()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 400
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 1000
|
||||
preset_validation_params.num_workers = 8
|
||||
preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
|
||||
|
||||
68
rl_coach/presets/Mujoco_A3C_LSTM.py
Normal file
68
rl_coach/presets/Mujoco_A3C_LSTM.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.architectures.tensorflow_components.middlewares.lstm_middleware import LSTMMiddlewareParameters
|
||||
from rl_coach.base_parameters import VisualizationParameters, InputEmbedderParameters, MiddlewareScheme, PresetValidationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = ActorCriticAgentParameters()
|
||||
agent_params.algorithm.apply_gradients_every_x_episodes = 1
|
||||
agent_params.algorithm.num_steps_between_gradient_updates = 20
|
||||
agent_params.algorithm.beta_entropy = 0.005
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00002
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'] = \
|
||||
InputEmbedderParameters(scheme=[Dense([200])])
|
||||
agent_params.network_wrappers['main'].middleware_parameters = LSTMMiddlewareParameters(scheme=MiddlewareScheme.Empty,
|
||||
number_of_lstm_cells=128)
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/20.))
|
||||
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
|
||||
|
||||
agent_params.exploration = ContinuousEntropyParameters()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 400
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 1000
|
||||
preset_validation_params.num_workers = 8
|
||||
preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
|
||||
|
||||
78
rl_coach/presets/Mujoco_ClippedPPO.py
Normal file
78
rl_coach/presets/Mujoco_ClippedPPO.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from rl_coach.exploration_policies .additive_noise import AdditiveNoiseParameters
|
||||
|
||||
from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.schedules import LinearSchedule
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2048)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = ClippedPPOAgentParameters()
|
||||
|
||||
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.0003
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'tanh'
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense([64])]
|
||||
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([64])]
|
||||
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'tanh'
|
||||
agent_params.network_wrappers['main'].batch_size = 64
|
||||
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
|
||||
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
|
||||
|
||||
agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
|
||||
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
|
||||
agent_params.algorithm.beta_entropy = 0
|
||||
agent_params.algorithm.gae_lambda = 0.95
|
||||
agent_params.algorithm.discount = 0.99
|
||||
agent_params.algorithm.optimization_epochs = 10
|
||||
agent_params.algorithm.estimate_state_value_using_gae = True
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.exploration = AdditiveNoiseParameters()
|
||||
agent_params.pre_network_filter = MujocoInputFilter()
|
||||
agent_params.pre_network_filter.add_observation_filter('observation', 'normalize_observation',
|
||||
ObservationNormalizationFilter(name='normalize_observation'))
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 400
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 1000
|
||||
preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
|
||||
|
||||
53
rl_coach/presets/Mujoco_DDPG.py
Normal file
53
rl_coach/presets/Mujoco_DDPG.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentSteps(2000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(1000)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = DDPGAgentParameters()
|
||||
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([400])]
|
||||
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([300])]
|
||||
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([400])]
|
||||
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
|
||||
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 400
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 1000
|
||||
preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
56
rl_coach/presets/Mujoco_NAF.py
Normal file
56
rl_coach/presets/Mujoco_NAF.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.agents.naf_agent import NAFAgentParameters
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase, GradientClippingMethod
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(1000)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = NAFAgentParameters()
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense([200])]
|
||||
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([200])]
|
||||
agent_params.network_wrappers['main'].clip_gradients = 1000
|
||||
agent_params.network_wrappers['main'].gradients_clipping_method = GradientClippingMethod.ClipByValue
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
|
||||
# this preset is currently broken - no test
|
||||
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
# preset_validation_params.test = True
|
||||
# preset_validation_params.min_reward_threshold = 200
|
||||
# preset_validation_params.max_episodes_to_achieve_reward = 600
|
||||
# preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
67
rl_coach/presets/Mujoco_PPO.py
Normal file
67
rl_coach/presets/Mujoco_PPO.py
Normal file
@@ -0,0 +1,67 @@
|
||||
from rl_coach.agents.ppo_agent import PPOAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
|
||||
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
|
||||
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
|
||||
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = PPOAgentParameters()
|
||||
agent_params.network_wrappers['actor'].learning_rate = 0.001
|
||||
agent_params.network_wrappers['critic'].learning_rate = 0.001
|
||||
|
||||
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([64])]
|
||||
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([64])]
|
||||
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([64])]
|
||||
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([64])]
|
||||
|
||||
agent_params.input_filter = MujocoInputFilter()
|
||||
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
|
||||
|
||||
agent_params.exploration = ContinuousEntropyParameters()
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
env_params = Mujoco()
|
||||
env_params.level = SingleLevelSelection(mujoco_v2)
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = False
|
||||
|
||||
|
||||
# this preset is currently broken
|
||||
|
||||
|
||||
########
|
||||
# Test #
|
||||
########
|
||||
preset_validation_params = PresetValidationParameters()
|
||||
preset_validation_params.test = True
|
||||
preset_validation_params.min_reward_threshold = 400
|
||||
preset_validation_params.max_episodes_to_achieve_reward = 3000
|
||||
preset_validation_params.reward_test_level = 'inverted_pendulum'
|
||||
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
preset_validation_params=preset_validation_params)
|
||||
|
||||
|
||||
148
rl_coach/presets/Pendulum_HAC.py
Normal file
148
rl_coach/presets/Pendulum_HAC.py
Normal file
@@ -0,0 +1,148 @@
|
||||
import numpy as np
|
||||
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
|
||||
from rl_coach.agents.hac_ddpg_agent import HACDDPGAgentParameters
|
||||
from rl_coach.architectures.tensorflow_components.architecture import Dense
|
||||
from rl_coach.base_parameters import VisualizationParameters, EmbeddingMergerType, EmbedderScheme, \
|
||||
InputEmbedderParameters
|
||||
from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod
|
||||
from rl_coach.environments.gym_environment import Mujoco
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.graph_managers.hac_graph_manager import HACGraphManager
|
||||
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
|
||||
EpisodicHindsightExperienceReplayParameters
|
||||
from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \
|
||||
EpisodicHRLHindsightExperienceReplayParameters
|
||||
from rl_coach.memories.memory import MemoryGranularity
|
||||
|
||||
from rl_coach.schedules import ConstantSchedule
|
||||
from rl_coach.spaces import GoalsSpace, ReachingGoal
|
||||
|
||||
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps
|
||||
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
|
||||
from rl_coach.exploration_policies.ou_process import OUProcessParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64) # 40 epochs
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(4 * 64) # 4 small batches of 64 episodes
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(64)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
|
||||
polar_coordinates = False
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
|
||||
if polar_coordinates:
|
||||
distance_from_goal_threshold = np.array([0.075, 0.75])
|
||||
else:
|
||||
distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])
|
||||
goals_space = GoalsSpace('achieved_goal',
|
||||
ReachingGoal(default_reward=-1, goal_reaching_reward=0,
|
||||
distance_from_goal_threshold=distance_from_goal_threshold),
|
||||
lambda goal, state: np.abs(goal - state)) # raw L1 distance
|
||||
|
||||
# top agent
|
||||
top_agent_params = HACDDPGAgentParameters()
|
||||
|
||||
top_agent_params.memory = EpisodicHRLHindsightExperienceReplayParameters()
|
||||
top_agent_params.memory.max_size = (MemoryGranularity.Transitions, 10000000)
|
||||
top_agent_params.memory.hindsight_transitions_per_regular_transition = 3
|
||||
top_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
|
||||
top_agent_params.memory.goals_space = goals_space
|
||||
top_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)
|
||||
top_agent_params.algorithm.num_consecutive_training_steps = 40
|
||||
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
|
||||
|
||||
# exploration - OU process
|
||||
top_agent_params.exploration = OUProcessParameters()
|
||||
top_agent_params.exploration.theta = 0.1
|
||||
|
||||
# actor
|
||||
top_actor = top_agent_params.network_wrappers['actor']
|
||||
top_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
|
||||
top_actor.middleware_parameters.scheme = [Dense([64])] * 3
|
||||
top_actor.learning_rate = 0.001
|
||||
top_actor.batch_size = 4096
|
||||
|
||||
# critic
|
||||
top_critic = top_agent_params.network_wrappers['critic']
|
||||
top_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
|
||||
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
|
||||
top_critic.middleware_parameters.scheme = [Dense([64])] * 3
|
||||
top_critic.learning_rate = 0.001
|
||||
top_critic.batch_size = 4096
|
||||
|
||||
# ----------
|
||||
|
||||
# bottom agent
|
||||
bottom_agent_params = HACDDPGAgentParameters()
|
||||
|
||||
# TODO: we should do this is a cleaner way. probably HACGraphManager, should set this for all non top-level agents
|
||||
bottom_agent_params.algorithm.in_action_space = goals_space
|
||||
|
||||
bottom_agent_params.memory = EpisodicHindsightExperienceReplayParameters()
|
||||
bottom_agent_params.memory.max_size = (MemoryGranularity.Transitions, 12000000)
|
||||
bottom_agent_params.memory.hindsight_transitions_per_regular_transition = 4
|
||||
bottom_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
|
||||
bottom_agent_params.memory.goals_space = goals_space
|
||||
bottom_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16 * 25) # 25 episodes is one true env episode
|
||||
bottom_agent_params.algorithm.num_consecutive_training_steps = 40
|
||||
bottom_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
|
||||
|
||||
bottom_agent_params.exploration = EGreedyParameters()
|
||||
bottom_agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
|
||||
bottom_agent_params.exploration.evaluation_epsilon = 0
|
||||
bottom_agent_params.exploration.continuous_exploration_policy_parameters = OUProcessParameters()
|
||||
bottom_agent_params.exploration.continuous_exploration_policy_parameters.theta = 0.1
|
||||
|
||||
# actor
|
||||
bottom_actor = bottom_agent_params.network_wrappers['actor']
|
||||
bottom_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
|
||||
bottom_actor.middleware_parameters.scheme = [Dense([64])] * 3
|
||||
bottom_actor.learning_rate = 0.001
|
||||
bottom_actor.batch_size = 4096
|
||||
|
||||
# critic
|
||||
bottom_critic = bottom_agent_params.network_wrappers['critic']
|
||||
bottom_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
|
||||
'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
|
||||
bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat
|
||||
bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3
|
||||
bottom_critic.learning_rate = 0.001
|
||||
bottom_critic.batch_size = 4096
|
||||
|
||||
agents_params = [top_agent_params, bottom_agent_params]
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
time_limit = 1000
|
||||
|
||||
env_params = Mujoco()
|
||||
env_params.level = "rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals"
|
||||
env_params.additional_simulator_parameters = {"time_limit": time_limit,
|
||||
"random_goals_instead_of_standing_goal": False,
|
||||
"polar_coordinates": polar_coordinates,
|
||||
"goal_reaching_thresholds": distance_from_goal_threshold}
|
||||
env_params.frame_skip = 10
|
||||
env_params.custom_reward_threshold = -time_limit + 1
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]
|
||||
vis_params.dump_mp4 = False
|
||||
vis_params.native_rendering = False
|
||||
|
||||
graph_manager = HACGraphManager(agents_params=agents_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params,
|
||||
consecutive_steps_to_run_non_top_levels=EnvironmentSteps(40))
|
||||
65
rl_coach/presets/Starcraft_CollectMinerals_A3C.py
Normal file
65
rl_coach/presets/Starcraft_CollectMinerals_A3C.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from rl_coach.agents.actor_critic_agent import ActorCriticAgentParameters
|
||||
from rl_coach.agents.policy_optimization_agent import PolicyGradientRescaler
|
||||
from rl_coach.base_parameters import VisualizationParameters, InputEmbedderParameters
|
||||
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, AlwaysDumpMethod
|
||||
from rl_coach.environments.starcraft2_environment import StarCraft2EnvironmentParameters
|
||||
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
|
||||
from rl_coach.graph_managers.graph_manager import ScheduleParameters
|
||||
from rl_coach.schedules import LinearSchedule, ConstantSchedule
|
||||
|
||||
from rl_coach.core_types import RunPhase
|
||||
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
|
||||
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
|
||||
|
||||
####################
|
||||
# Graph Scheduling #
|
||||
####################
|
||||
schedule_params = ScheduleParameters()
|
||||
schedule_params.improve_steps = TrainingSteps(10000000000)
|
||||
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
|
||||
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
|
||||
schedule_params.heatup_steps = EnvironmentSteps(0)
|
||||
|
||||
#########
|
||||
# Agent #
|
||||
#########
|
||||
agent_params = ActorCriticAgentParameters()
|
||||
|
||||
agent_params.algorithm.policy_gradient_rescaler = PolicyGradientRescaler.GAE
|
||||
agent_params.algorithm.apply_gradients_every_x_episodes = 1
|
||||
agent_params.algorithm.num_steps_between_gradient_updates = 20
|
||||
agent_params.algorithm.gae_lambda = 0.96
|
||||
agent_params.algorithm.beta_entropy = 0
|
||||
|
||||
agent_params.network_wrappers['main'].clip_gradients = 10.0
|
||||
agent_params.network_wrappers['main'].learning_rate = 0.00001
|
||||
# agent_params.network_wrappers['main'].batch_size = 20
|
||||
agent_params.network_wrappers['main'].input_embedders_parameters = {
|
||||
"screen": InputEmbedderParameters(input_rescaling={'image': 3.0})
|
||||
}
|
||||
|
||||
agent_params.exploration = AdditiveNoiseParameters()
|
||||
agent_params.exploration.noise_percentage_schedule = ConstantSchedule(0.05)
|
||||
# agent_params.exploration.noise_percentage_schedule = LinearSchedule(0.4, 0.05, 100000)
|
||||
agent_params.exploration.evaluation_noise_percentage = 0.05
|
||||
|
||||
agent_params.network_wrappers['main'].batch_size = 64
|
||||
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
|
||||
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
|
||||
|
||||
###############
|
||||
# Environment #
|
||||
###############
|
||||
|
||||
env_params = StarCraft2EnvironmentParameters()
|
||||
env_params.level = 'CollectMineralShards'
|
||||
env_params.feature_screen_maps_to_use = [5]
|
||||
env_params.feature_minimap_maps_to_use = [5]
|
||||
|
||||
vis_params = VisualizationParameters()
|
||||
# vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
|
||||
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST),MaxDumpMethod()]
|
||||
vis_params.dump_mp4 = True
|
||||
|
||||
graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
|
||||
schedule_params=schedule_params, vis_params=vis_params)
|
||||
65
rl_coach/presets/Starcraft_CollectMinerals_Dueling_DDQN.py
Normal file
65
rl_coach/presets/Starcraft_CollectMinerals_Dueling_DDQN.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# Dueling DDQN preset for the StarCraft II 'CollectMineralShards' mini-game.
from collections import OrderedDict

from rl_coach.agents.ddqn_agent import DDQNAgentParameters
from rl_coach.architectures.tensorflow_components.heads.dueling_q_head import DuelingQHeadParameters
from rl_coach.base_parameters import InputEmbedderParameters, VisualizationParameters
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod
from rl_coach.environments.starcraft2_environment import StarCraft2EnvironmentParameters
from rl_coach.filters.action.box_discretization import BoxDiscretization
from rl_coach.filters.filter import OutputFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import LinearSchedule

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
# Fill the replay memory with 50k random steps before training starts.
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0001
# Embed only the screen observation; pixel values are rescaled by 3.0.
agent_params.network_wrappers['main'].input_embedders_parameters = {
    "screen": InputEmbedderParameters(input_rescaling={'image': 3.0})
}
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
# Epsilon-greedy exploration annealed from 1.0 to 0.1 over 1M steps.
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.1, 1000000)
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
# Discretize each continuous action dimension into 4 integer bins.
agent_params.output_filter = OutputFilter(
    action_filters=OrderedDict([
        ('discretization', BoxDiscretization(num_bins_per_dimension=4, force_int_bins=True))
    ]),
    is_a_reference_filter=False
)

###############
# Environment #
###############
env_params = StarCraft2EnvironmentParameters()
env_params.level = 'CollectMineralShards'
# Feature map index 5 (player_relative) from both screen and minimap.
env_params.feature_screen_maps_to_use = [5]
env_params.feature_minimap_maps_to_use = [5]

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params)
|
||||
0
rl_coach/presets/__init__.py
Normal file
0
rl_coach/presets/__init__.py
Normal file
Reference in New Issue
Block a user