mirror of https://github.com/gryf/coach.git

coach v0.8.0

configurations.py (new file, 532 lines)

@@ -0,0 +1,532 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from utils import Enum
import json
from logger import screen, logger

class Frameworks(Enum):
    TensorFlow = 1
    Neon = 2


class InputTypes:
    Observation = 1
    Measurements = 2
    GoalVector = 3
    Action = 4
    TimedObservation = 5


class OutputTypes:
    Q = 1
    DuelingQ = 2
    V = 3
    Pi = 4
    MeasurementsPrediction = 5
    DNDQ = 6
    NAF = 7
    PPO = 8
    PPO_V = 9
    DistributionalQ = 10


class MiddlewareTypes:
    LSTM = 1
    FC = 2

class AgentParameters:
    agent = ''

    # Architecture parameters
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Q]
    middleware_type = MiddlewareTypes.FC
    loss_weights = [1.0]
    stop_gradients_from_head = [False]
    num_output_head_copies = 1
    use_measurements = False
    use_accumulated_reward_as_measurement = False
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 0
    hidden_layers_activation_function = 'relu'
    optimizer_type = 'Adam'
    async_training = False
    use_separate_networks_per_head = False

    # Agent parameters
    num_consecutive_playing_steps = 1
    num_consecutive_training_steps = 1
    bootstrap_total_return_from_old_policy = False
    n_step = -1
    num_episodes_in_experience_replay = 200
    num_transitions_in_experience_replay = None
    discount = 0.99
    policy_gradient_rescaler = 'A_VALUE'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # t_max
    num_steps_between_copying_online_weights_to_target = 1000
    rate_for_copying_weights_to_target = 1.0
    monte_carlo_mixing_rate = 0.1
    gae_lambda = 0.96
    step_until_collecting_full_episodes = False
    targets_horizon = 'N-Step'
    replace_mse_with_huber_loss = False

    # PPO related params
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = None
    use_kl_regularization = True
    estimate_value_using_gae = False

    # DFP related params
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]

    # NEC related params
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1
    number_of_knn = 50
    DND_key_error_threshold = 0.01

    # Framework support
    neon_support = False
    tensorflow_support = True

    # distributed agents params
    shared_optimizer = True
    share_statistics_between_workers = True

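# Since every field above is a plain class attribute, a preset can override
# any subset of these defaults simply by subclassing; lookup falls back
# through the MRO for everything it does not redefine. For example (a
# hypothetical preset, not defined in this file):
#
#     class PatientDQN(DQN):
#         discount = 0.995
#         num_steps_between_copying_online_weights_to_target = 5000
#
# The concrete agent classes further below (DQN, NEC, PPO, ...) follow this
# exact pattern against the defaults above.
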
class EnvironmentParameters:
    type = 'Doom'
    level = 'basic'
    observation_stack_size = 4
    frame_skip = 4
    desired_observation_width = 76
    desired_observation_height = 60
    normalize_observation = False
    reward_scaling = 1.0
    reward_clipping_min = None
    reward_clipping_max = None


class ExplorationParameters:
    # Exploration policies
    policy = 'EGreedy'
    evaluation_policy = 'Greedy'
    # -- bootstrap dqn parameters
    bootstrapped_data_sharing_probability = 0.5
    architecture_num_q_heads = 1
    # -- dropout approximation of thompson sampling parameters
    dropout_discard_probability = 0
    initial_keep_probability = 0.0  # unused
    final_keep_probability = 0.99  # unused
    keep_probability_decay_steps = 50000  # unused
    # -- epsilon greedy parameters
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05
    # -- epsilon greedy at end of episode parameters
    average_episode_length_over_num_episodes = 20
    # -- boltzmann softmax parameters
    initial_temperature = 100.0
    final_temperature = 1.0
    temperature_decay_steps = 50000
    # -- additive noise
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 1
    # -- Ornstein-Uhlenbeck process
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01

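# The initial/final/decay-steps triples above describe annealing schedules.
# Assuming a linear schedule (the actual decay logic lives in the exploration
# policy implementations, not in this file), epsilon would evolve as:
#
#     def epsilon_at(step, p=ExplorationParameters):
#         fraction = min(float(step) / p.epsilon_decay_steps, 1.0)
#         return p.initial_epsilon + fraction * (p.final_epsilon - p.initial_epsilon)
#
# i.e. from 0.5 down to 0.01 over the first 50000 steps, constant afterwards.
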
class GeneralParameters:
    train = True
    framework = Frameworks.TensorFlow
    threads = 1
    sess = None

    # distributed training options
    num_threads = 1
    synchronize_over_num_threads = 1
    distributed = False

    # Agent blocks
    memory = 'EpisodicExperienceReplay'
    architecture = 'GeneralTensorFlowNetwork'

    # General parameters
    clip_gradients = None
    kl_divergence_constraint = 100000
    num_training_iterations = 10000000000
    num_heatup_steps = 1000
    batch_size = 32
    save_model_sec = None
    save_model_dir = None
    checkpoint_restore_dir = None
    learning_rate = 0.00025
    learning_rate_decay_rate = 0
    learning_rate_decay_steps = 0
    evaluation_episodes = 5
    evaluate_every_x_episodes = 1000000
    rescaling_interpolation_type = 'bilinear'

    # setting a seed will only work for non-parallel algorithms. Parallel algorithms add uncontrollable noise in
    # the form of different workers starting at different times, and getting different assignments of CPU
    # time from the OS.
    seed = None

    checkpoints_path = ''

    # Testing parameters
    test = False
    test_min_return_threshold = 0
    test_max_step_threshold = 1
    test_num_workers = 1


class VisualizationParameters:
    # Visualization parameters
    record_video_every = 1000
    video_path = '/home/llt_lab/temp/breakout-videos'
    plot_action_values_online = False
    show_saliency_maps_every_num_episodes = 1000000000
    print_summary = False
    dump_csv = True
    dump_signals_to_csv_every_x_episodes = 10
    render = False
    dump_gifs = True


class Roboschool(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class GymVectorObservation(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class Bullet(EnvironmentParameters):
    type = 'Bullet'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class Atari(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 1
    observation_stack_size = 4
    desired_observation_height = 84
    desired_observation_width = 84
    reward_clipping_max = 1.0
    reward_clipping_min = -1.0


class Doom(EnvironmentParameters):
    type = 'Doom'
    frame_skip = 4
    observation_stack_size = 3
    desired_observation_height = 60
    desired_observation_width = 76

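# A new environment preset follows the same subclassing pattern as the ones
# above; e.g. a hypothetical vector-observation Gym task (illustrative only,
# not part of this file):
#
#     class CartPole(GymVectorObservation):
#         level = 'CartPole-v0'
#
# Only the attributes that differ from the chosen parent need to be stated.
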
class NStepQ(AgentParameters):
    type = 'NStepQAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    optimizer_type = 'Adam'
    num_steps_between_copying_online_weights_to_target = 1000
    num_episodes_in_experience_replay = 2
    apply_gradients_every_x_episodes = 1
    num_steps_between_gradient_updates = 20  # this is called t_max in all the papers
    hidden_layers_activation_function = 'elu'
    targets_horizon = 'N-Step'
    async_training = True
    shared_optimizer = True


class DQN(AgentParameters):
    type = 'DQNAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    optimizer_type = 'Adam'
    num_steps_between_copying_online_weights_to_target = 1000
    neon_support = True
    async_training = True
    shared_optimizer = True


class DDQN(DQN):
    type = 'DDQNAgent'


class DuelingDQN(DQN):
    type = 'DQNAgent'
    output_types = [OutputTypes.DuelingQ]


class BootstrappedDQN(DQN):
    type = 'BootstrappedDQNAgent'
    num_output_head_copies = 10


class DistributionalDQN(DQN):
    type = 'DistributionalDQNAgent'
    output_types = [OutputTypes.DistributionalQ]
    v_min = -10.0
    v_max = 10.0
    atoms = 51

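# DistributionalDQN parameterizes a C51-style categorical value distribution
# (Bellemare et al., 2017): `atoms` support points evenly spaced over
# [v_min, v_max], i.e. an atom spacing of
#
#     (v_max - v_min) / (atoms - 1) = 20.0 / 50 = 0.4
#
# The distributional projection itself is implemented by the DistributionalQ
# head, not in this file.
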
class NEC(AgentParameters):
    type = 'NECAgent'
    optimizer_type = 'RMSProp'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.DNDQ]
    loss_weights = [1.0]
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1
    number_of_knn = 50
    n_step = 100
    bootstrap_total_return_from_old_policy = True
    DND_key_error_threshold = 0.1

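# Background for the NEC/DND parameters (Pritzel et al., 2017; the lookup
# itself lives in the DND implementation, not here): a query key h is matched
# against its `number_of_knn` nearest stored keys h_i, weighted by the
# inverse-distance kernel
#
#     k(h, h_i) = 1 / (||h - h_i||^2 + delta)
#
# where delta is `l2_norm_added_delta`. Judging by its name,
# `new_value_shift_coefficient` sets how far an existing entry's value is
# shifted toward a newly observed return.
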
class ActorCritic(AgentParameters):
    type = 'ActorCriticAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.V, OutputTypes.Pi]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'A_VALUE'
    hidden_layers_activation_function = 'elu'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
    gae_lambda = 0.96
    shared_optimizer = True
    estimate_value_using_gae = False
    async_training = True


class PolicyGradient(AgentParameters):
    type = 'PolicyGradientsAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Pi]
    loss_weights = [1.0]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # this is called t_max in all the papers
    async_training = True


class DDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = [InputTypes.Observation, InputTypes.Action]
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    shared_optimizer = True
    async_training = True

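# DDPG's target-network settings encode the soft ("Polyak") update from the
# DDPG paper: with tau = rate_for_copying_weights_to_target = 0.001 applied
# every step (num_steps_between_copying_online_weights_to_target = 1),
#
#     theta_target <- tau * theta_online + (1 - tau) * theta_target
#
# DDDPG below instead uses tau = 1 every 10 steps, i.e. a periodic hard copy
# at a coarser interval.
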
class DDDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = [InputTypes.Observation, InputTypes.Action]
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 10
    rate_for_copying_weights_to_target = 1
    shared_optimizer = True
    async_training = True


class NAF(AgentParameters):
    type = 'NAFAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.NAF]
    loss_weights = [1.0]
    hidden_layers_activation_function = 'tanh'
    num_consecutive_training_steps = 5
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    optimizer_type = 'RMSProp'
    async_training = True


class PPO(AgentParameters):
    type = 'PPOAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.V]
    loss_weights = [1.0]
    hidden_layers_activation_function = 'tanh'
    num_episodes_in_experience_replay = 1000000
    policy_gradient_rescaler = 'A_VALUE'
    gae_lambda = 0.96
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    add_a_normalized_timestep_to_the_observation = True
    l2_regularization = 0  # 1e-3
    value_targets_mix_fraction = 0.1
    async_training = True
    estimate_value_using_gae = True
    step_until_collecting_full_episodes = True


class ClippedPPO(AgentParameters):
    type = 'ClippedPPOAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.V, OutputTypes.PPO]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    hidden_layers_activation_function = 'tanh'
    num_episodes_in_experience_replay = 1000000
    policy_gradient_rescaler = 'GAE'
    gae_lambda = 0.95
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 1e-3
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = 0.2
    async_training = False
    use_kl_regularization = False
    estimate_value_using_gae = True
    batch_size = 64
    use_separate_networks_per_head = True
    step_until_collecting_full_episodes = True
    beta_entropy = 0.01

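# ClippedPPO uses the clipped surrogate objective (Schulman et al., 2017):
# with probability ratio r = pi_new(a|s) / pi_old(a|s) and
# epsilon = clip_likelihood_ratio_using_epsilon = 0.2, the policy loss is
#
#     L = -E[min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A)]
#
# while use_kl_regularization = False turns off the KL-penalty formulation
# that the (non-clipped) PPO preset above relies on.
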
class DFP(AgentParameters):
    type = 'DFPAgent'
    input_types = [InputTypes.Observation, InputTypes.Measurements, InputTypes.GoalVector]
    output_types = [OutputTypes.MeasurementsPrediction]
    loss_weights = [1.0]
    use_measurements = True
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]
    async_training = True


class MMC(AgentParameters):
    type = 'MixedMonteCarloAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    num_steps_between_copying_online_weights_to_target = 1000
    monte_carlo_mixing_rate = 0.1
    neon_support = True


class PAL(AgentParameters):
    type = 'PALAgent'
    input_types = [InputTypes.Observation]
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    pal_alpha = 0.9
    persistent_advantage_learning = False
    num_steps_between_copying_online_weights_to_target = 1000
    neon_support = True


class EGreedyExploration(ExplorationParameters):
    policy = 'EGreedy'
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 50000


class BootstrappedDQNExploration(ExplorationParameters):
    policy = 'Bootstrapped'
    architecture_num_q_heads = 10
    bootstrapped_data_sharing_probability = 0.1


class OUExploration(ExplorationParameters):
    policy = 'OUProcess'
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01

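# OUExploration parameterizes temporally correlated noise via an
# Ornstein-Uhlenbeck process; in its standard discretization (the actual
# implementation lives in the exploration policies) the noise evolves as
#
#     x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
#
# theta = 0.15 matches the DDPG paper's choice; sigma is set slightly higher
# here (0.3 vs. 0.2 there).
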
class AdditiveNoiseExploration(ExplorationParameters):
    policy = 'AdditiveNoise'
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 50000


class EntropyExploration(ExplorationParameters):
    policy = 'ContinuousEntropy'


class CategoricalExploration(ExplorationParameters):
    policy = 'Categorical'


class Preset(GeneralParameters):
    def __init__(self, agent, env, exploration, visualization=VisualizationParameters):
        """
        :type agent: AgentParameters
        :type env: EnvironmentParameters
        :type exploration: ExplorationParameters
        :type visualization: VisualizationParameters
        """
        self.visualization = visualization
        self.agent = agent
        self.env = env
        self.exploration = exploration
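# Minimal usage sketch (illustrative; concrete presets are expected to be
# defined in a separate presets module): Preset is handed the parameter
# *classes* themselves rather than instances, matching the class-attribute
# style used throughout this file.
#
#     preset = Preset(agent=DQN, env=Atari, exploration=EGreedyExploration)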