mirror of https://github.com/gryf/coach.git synced 2025-12-17 19:20:19 +01:00
coach/configurations.py
Roman Dobosz 1b095aeeca Cleanup imports.
Until now, most of the modules imported all of another module's objects
(variables, classes, functions, and even that module's own imports) into
their namespace, which could cause (and in places did cause)
unintentional use of classes or methods that were only imported
indirectly.

With this patch, all star imports are replaced with imports of the
top-level module that provides the desired class or function.

In addition, all imports were sorted (where possible) as PEP 8[1]
suggests: standard library imports first, then third party imports
(numpy, tensorflow, etc.), and finally coach modules, with each section
separated by one empty line.

[1] https://www.python.org/dev/peps/pep-0008/#imports
2018-04-13 09:58:40 +02:00
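
For illustration, the pattern the commit describes looks roughly like
this ("helper" is a hypothetical name, not one from the codebase):

    # before: the star import pulls every name into this namespace,
    # including names that utils itself imported from elsewhere
    from utils import *
    helper()  # unclear which module actually defines this

    # after: import the providing module and qualify each use
    import utils
    utils.helper()  # provenance is explicit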

622 lines · 18 KiB · Python

#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import types

import utils

class Frameworks(utils.Enum):
    TensorFlow = 1
    Neon = 2


class InputTypes(object):
    Observation = 1
    Measurements = 2
    GoalVector = 3
    Action = 4
    TimedObservation = 5


class EmbedderComplexity(object):
    Shallow = 1
    Deep = 2


class OutputTypes(object):
    Q = 1
    DuelingQ = 2
    V = 3
    Pi = 4
    MeasurementsPrediction = 5
    DNDQ = 6
    NAF = 7
    PPO = 8
    PPO_V = 9
    CategoricalQ = 10
    QuantileRegressionQ = 11


class MiddlewareTypes(object):
    LSTM = 1
    FC = 2


class Parameters(object):
    def __str__(self):
        parameters = {}
        for k, v in self.__dict__.items():
            if isinstance(v, type) and issubclass(v, Parameters):
                # v.__dict__ doesn't return a dictionary but a mappingproxy
                # which json doesn't serialize, so convert it into a normal
                # dictionary
                parameters[k] = dict(v.__dict__.items())
            elif isinstance(v, types.MappingProxyType):
                parameters[k] = dict(v.items())
            else:
                parameters[k] = v
        return json.dumps(parameters, indent=4, default=repr)
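

# Usage sketch (hypothetical, for illustration only): any Parameters
# instance prints as JSON, e.g.
#
#     print(Preset(DQN, Atari, EGreedyExploration))
#
# dumps the agent, env, and exploration settings; because those are
# classes, their attributes are reached through v.__dict__ (a
# mappingproxy), hence the conversion above.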

class AgentParameters(Parameters):
    agent = ''

    # Architecture parameters
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    middleware_type = MiddlewareTypes.FC
    loss_weights = [1.0]
    stop_gradients_from_head = [False]
    embedder_complexity = EmbedderComplexity.Shallow
    num_output_head_copies = 1
    use_measurements = False
    use_accumulated_reward_as_measurement = False
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 0
    hidden_layers_activation_function = 'relu'
    optimizer_type = 'Adam'
    async_training = False
    use_separate_networks_per_head = False

    # Agent parameters
    num_consecutive_playing_steps = 1
    num_consecutive_training_steps = 1
    update_evaluation_agent_network_after_every_num_steps = 3000
    bootstrap_total_return_from_old_policy = False
    n_step = -1
    num_episodes_in_experience_replay = 200
    num_transitions_in_experience_replay = None
    discount = 0.99
    policy_gradient_rescaler = 'A_VALUE'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # t_max
    num_steps_between_copying_online_weights_to_target = 1000
    rate_for_copying_weights_to_target = 1.0
    monte_carlo_mixing_rate = 0.1
    gae_lambda = 0.96
    step_until_collecting_full_episodes = False
    targets_horizon = 'N-Step'
    replace_mse_with_huber_loss = False
    load_memory_from_file_path = None
    collect_new_data = True
    input_rescaler = 255.0

    # PPO related params
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = None
    use_kl_regularization = True
    estimate_value_using_gae = False

    # DFP related params
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]

    # NEC related params
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1
    number_of_knn = 50
    DND_key_error_threshold = 0.01

    # Framework support
    neon_support = False
    tensorflow_support = True

    # distributed agents params
    shared_optimizer = True
    share_statistics_between_workers = True


class EnvironmentParameters(Parameters):
    type = 'Doom'
    level = 'basic'
    observation_stack_size = 4
    frame_skip = 4
    desired_observation_width = 76
    desired_observation_height = 60
    normalize_observation = False
    crop_observation = False
    random_initialization_steps = 0
    reward_scaling = 1.0
    reward_clipping_min = None
    reward_clipping_max = None
    human_control = False


class ExplorationParameters(Parameters):
    # Exploration policies
    policy = 'EGreedy'
    evaluation_policy = 'Greedy'

    # -- bootstrap dqn parameters
    bootstrapped_data_sharing_probability = 0.5
    architecture_num_q_heads = 1

    # -- dropout approximation of thompson sampling parameters
    dropout_discard_probability = 0
    initial_keep_probability = 0.0  # unused
    final_keep_probability = 0.99  # unused
    keep_probability_decay_steps = 50000  # unused

    # -- epsilon greedy parameters
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05

    # -- epsilon greedy at end of episode parameters
    average_episode_length_over_num_episodes = 20

    # -- boltzmann softmax parameters
    initial_temperature = 100.0
    final_temperature = 1.0
    temperature_decay_steps = 50000

    # -- additive noise
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 1

    # -- Ornstein-Uhlenbeck process
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01


class GeneralParameters(Parameters):
    train = True
    framework = Frameworks.TensorFlow
    threads = 1
    sess = None

    # distributed training options
    num_threads = 1
    synchronize_over_num_threads = 1
    distributed = False

    # Agent blocks
    memory = 'EpisodicExperienceReplay'
    architecture = 'GeneralTensorFlowNetwork'

    # General parameters
    clip_gradients = None
    kl_divergence_constraint = 100000
    num_training_iterations = 10000000000
    num_heatup_steps = 1000
    heatup_using_network_decisions = False
    batch_size = 32
    save_model_sec = None
    save_model_dir = None
    checkpoint_restore_dir = None
    learning_rate = 0.00025
    learning_rate_decay_rate = 0
    learning_rate_decay_steps = 0
    evaluation_episodes = 5
    evaluate_every_x_episodes = 1000000
    evaluate_every_x_training_iterations = 0
    rescaling_interpolation_type = 'bilinear'
    current_episode = 0
    # setting a seed will only work for non-parallel algorithms. Parallel
    # algorithms add uncontrollable noise in the form of different workers
    # starting at different times, and getting different assignments of
    # CPU time from the OS.
    seed = None
    checkpoints_path = ''

    # Testing parameters
    test = False
    test_min_return_threshold = 0
    test_max_step_threshold = 1
    test_num_workers = 1


class VisualizationParameters(Parameters):
    # Visualization parameters
    record_video_every = 1000
    video_path = '/home/llt_lab/temp/breakout-videos'
    plot_action_values_online = False
    show_saliency_maps_every_num_episodes = 1000000000
    render_observation = False
    print_summary = False
    dump_csv = True
    dump_signals_to_csv_every_x_episodes = 5
    render = False
    dump_gifs = True
    max_fps_for_human_control = 10
    tensorboard = False


class Roboschool(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class GymVectorObservation(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class Bullet(EnvironmentParameters):
    type = 'Bullet'
    frame_skip = 1
    observation_stack_size = 1
    desired_observation_height = None
    desired_observation_width = None


class Atari(EnvironmentParameters):
    type = 'Gym'
    frame_skip = 4
    observation_stack_size = 4
    desired_observation_height = 84
    desired_observation_width = 84
    reward_clipping_max = 1.0
    reward_clipping_min = -1.0
    random_initialization_steps = 30
    # in the original paper the observation is cropped but not in the
    # Nature paper
    crop_observation = False


class Doom(EnvironmentParameters):
    type = 'Doom'
    frame_skip = 4
    observation_stack_size = 3
    desired_observation_height = 60
    desired_observation_width = 76


class Carla(EnvironmentParameters):
    type = 'Carla'
    frame_skip = 1
    observation_stack_size = 4
    desired_observation_height = 128
    desired_observation_width = 180
    normalize_observation = False
    server_height = 256
    server_width = 360
    config = 'environments/CarlaSettings.ini'
    level = 'town1'
    verbose = True
    stereo = False
    semantic_segmentation = False
    depth = False
    episode_max_time = 100000  # milliseconds for each episode
    continuous_to_bool_threshold = 0.5
    allow_braking = False


class Human(AgentParameters):
    type = 'HumanAgent'
    num_episodes_in_experience_replay = 10000000


class NStepQ(AgentParameters):
    type = 'NStepQAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    optimizer_type = 'Adam'
    num_steps_between_copying_online_weights_to_target = 1000
    num_episodes_in_experience_replay = 2
    apply_gradients_every_x_episodes = 1
    num_steps_between_gradient_updates = 20  # this is called t_max in all the papers
    hidden_layers_activation_function = 'elu'
    targets_horizon = 'N-Step'
    async_training = True
    shared_optimizer = True


class DQN(AgentParameters):
    type = 'DQNAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    optimizer_type = 'Adam'
    num_steps_between_copying_online_weights_to_target = 1000
    neon_support = True
    async_training = True
    shared_optimizer = True


class DDQN(DQN):
    type = 'DDQNAgent'
    num_steps_between_copying_online_weights_to_target = 30000


class DuelingDQN(DQN):
    type = 'DQNAgent'
    output_types = [OutputTypes.DuelingQ]


class BootstrappedDQN(DQN):
    type = 'BootstrappedDQNAgent'
    num_output_head_copies = 10


class CategoricalDQN(DQN):
    type = 'CategoricalDQNAgent'
    output_types = [OutputTypes.CategoricalQ]
    v_min = -10.0
    v_max = 10.0
    atoms = 51
    neon_support = False


class QuantileRegressionDQN(DQN):
    type = 'QuantileRegressionDQNAgent'
    output_types = [OutputTypes.QuantileRegressionQ]
    atoms = 51


class NEC(AgentParameters):
    type = 'NECAgent'
    optimizer_type = 'Adam'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.DNDQ]
    loss_weights = [1.0]
    dnd_size = 500000
    l2_norm_added_delta = 0.001
    new_value_shift_coefficient = 0.1  # alpha
    number_of_knn = 50
    n_step = 100
    bootstrap_total_return_from_old_policy = True
    DND_key_error_threshold = 0
    input_rescaler = 1.0
    num_consecutive_playing_steps = 4


class ActorCritic(AgentParameters):
    type = 'ActorCriticAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V, OutputTypes.Pi]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'A_VALUE'
    hidden_layers_activation_function = 'elu'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 5000  # this is called t_max in all the papers
    gae_lambda = 0.96
    shared_optimizer = True
    estimate_value_using_gae = False
    async_training = True


class PolicyGradient(AgentParameters):
    type = 'PolicyGradientsAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Pi]
    loss_weights = [1.0]
    num_episodes_in_experience_replay = 2
    policy_gradient_rescaler = 'FUTURE_RETURN_NORMALIZED_BY_TIMESTEP'
    apply_gradients_every_x_episodes = 5
    beta_entropy = 0
    num_steps_between_gradient_updates = 20000  # this is called t_max in all the papers
    async_training = True


class DDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    shared_optimizer = True
    async_training = True


class DDDPG(AgentParameters):
    type = 'DDPGAgent'
    input_types = {'observation': InputTypes.Observation, 'action': InputTypes.Action}
    output_types = [OutputTypes.V]  # V is used because we only want a single Q value
    loss_weights = [1.0]
    hidden_layers_activation_function = 'relu'
    num_episodes_in_experience_replay = 10000
    num_steps_between_copying_online_weights_to_target = 10
    rate_for_copying_weights_to_target = 1
    shared_optimizer = True
    async_training = True


class NAF(AgentParameters):
    type = 'NAFAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.NAF]
    loss_weights = [1.0]
    hidden_layers_activation_function = 'tanh'
    num_consecutive_training_steps = 5
    num_steps_between_copying_online_weights_to_target = 1
    rate_for_copying_weights_to_target = 0.001
    optimizer_type = 'RMSProp'
    async_training = True


class PPO(AgentParameters):
    type = 'PPOAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V]
    loss_weights = [1.0]
    hidden_layers_activation_function = 'tanh'
    num_episodes_in_experience_replay = 1000000
    policy_gradient_rescaler = 'A_VALUE'
    gae_lambda = 0.96
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    add_a_normalized_timestep_to_the_observation = True
    l2_regularization = 0  # 1e-3
    value_targets_mix_fraction = 0.1
    async_training = True
    estimate_value_using_gae = True
    step_until_collecting_full_episodes = True


class ClippedPPO(AgentParameters):
    type = 'ClippedPPOAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.V, OutputTypes.PPO]
    loss_weights = [0.5, 1.0]
    stop_gradients_from_head = [False, False]
    hidden_layers_activation_function = 'tanh'
    num_episodes_in_experience_replay = 1000000
    policy_gradient_rescaler = 'GAE'
    gae_lambda = 0.95
    target_kl_divergence = 0.01
    initial_kl_coefficient = 1.0
    high_kl_penalty_coefficient = 1000
    add_a_normalized_timestep_to_the_observation = False
    l2_regularization = 1e-3
    value_targets_mix_fraction = 0.1
    clip_likelihood_ratio_using_epsilon = 0.2
    async_training = False
    use_kl_regularization = False
    estimate_value_using_gae = True
    batch_size = 64
    use_separate_networks_per_head = True
    step_until_collecting_full_episodes = True
    beta_entropy = 0.01


class DFP(AgentParameters):
    type = 'DFPAgent'
    input_types = {
        'observation': InputTypes.Observation,
        'measurements': InputTypes.Measurements,
        'goal': InputTypes.GoalVector
    }
    output_types = [OutputTypes.MeasurementsPrediction]
    loss_weights = [1.0]
    use_measurements = True
    num_predicted_steps_ahead = 6
    goal_vector = [1.0, 1.0]
    future_measurements_weights = [0.5, 0.5, 1.0]
    async_training = True


class MMC(AgentParameters):
    type = 'MixedMonteCarloAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    num_steps_between_copying_online_weights_to_target = 1000
    monte_carlo_mixing_rate = 0.1
    neon_support = True


class PAL(AgentParameters):
    type = 'PALAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    pal_alpha = 0.9
    persistent_advantage_learning = False
    num_steps_between_copying_online_weights_to_target = 1000
    neon_support = True


class BC(AgentParameters):
    type = 'BCAgent'
    input_types = {'observation': InputTypes.Observation}
    output_types = [OutputTypes.Q]
    loss_weights = [1.0]
    collect_new_data = False
    evaluate_every_x_training_iterations = 50000


class EGreedyExploration(ExplorationParameters):
    policy = 'EGreedy'
    initial_epsilon = 0.5
    final_epsilon = 0.01
    epsilon_decay_steps = 50000
    evaluation_epsilon = 0.05
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 50000


class BootstrappedDQNExploration(ExplorationParameters):
    policy = 'Bootstrapped'
    architecture_num_q_heads = 10
    bootstrapped_data_sharing_probability = 0.1


class OUExploration(ExplorationParameters):
    policy = 'OUProcess'
    mu = 0
    theta = 0.15
    sigma = 0.3
    dt = 0.01


class AdditiveNoiseExploration(ExplorationParameters):
    policy = 'AdditiveNoise'
    initial_noise_variance_percentage = 0.1
    final_noise_variance_percentage = 0.1
    noise_variance_decay_steps = 50000


class EntropyExploration(ExplorationParameters):
    policy = 'ContinuousEntropy'


class CategoricalExploration(ExplorationParameters):
    policy = 'Categorical'


class Preset(GeneralParameters):
    def __init__(self, agent, env, exploration, visualization=VisualizationParameters):
        """
        :type agent: AgentParameters
        :type env: EnvironmentParameters
        :type exploration: ExplorationParameters
        :type visualization: VisualizationParameters
        """
        self.visualization = visualization
        self.agent = agent
        self.env = env
        self.exploration = exploration
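

# Hypothetical example (not a preset shipped in this file): a preset
# combines an agent, an environment, and an exploration policy, and
# individual settings can then be overridden on the instance:
#
#     preset = Preset(agent=DQN, env=Atari, exploration=EGreedyExploration)
#     preset.env.level = 'BreakoutDeterministic-v4'
#     preset.learning_rate = 0.0001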