1
0
mirror of https://github.com/gryf/coach.git synced 2026-03-27 22:03:33 +01:00

pre-release 0.10.0

This commit is contained in:
Gal Novik
2018-08-13 17:11:34 +03:00
parent d44c329bb8
commit 19ca5c24b1
485 changed files with 33292 additions and 16770 deletions

View File

@@ -0,0 +1,42 @@
# Exploration Policy
An exploration policy is a module that is responsible for choosing the action according to the action values, the
current phase, its internal state and the specific exploration policy algorithm.
A custom exploration policy should implement both the exploration policy class and the exploration policy parameters
class, which defines the parameters and the location of the exploration policy module.
The parameters of the exploration policy class should match the parameters in the exploration policy parameters class.
Exploration policies typically have some control parameter that defines their current exploration state, and
a schedule for this parameter. This schedule can be defined using the Schedule class which is defined in
exploration_policy.py.
A custom implementation should look as follows:
```
class CustomExplorationParameters(ExplorationParameters):
def __init__(self):
super().__init__()
...
@property
def path(self):
return 'module_path:class_name'
class CustomExplorationPolicy(ExplorationPolicy):
def __init__(self, action_space: ActionSpace, ...):
super().__init__(action_space)
def reset(self):
...
def get_action(self, action_values: List[ActionType]) -> ActionType:
...
def change_phase(self, phase):
...
def get_control_param(self):
...
```

View File

@@ -0,0 +1,15 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

View File

@@ -0,0 +1,95 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, BoxActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
# TODO: consider renaming to gaussian sampling
class AdditiveNoiseParameters(ExplorationParameters):
    """Parameters for the AdditiveNoise exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for the noise stdev as a percentage of the action range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise percentage used while evaluating
        self.evaluation_noise_percentage = 0.05

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.additive_noise:AdditiveNoise'
class AdditiveNoise(ExplorationPolicy):
    """Exploration policy that perturbs continuous actions with Gaussian noise.

    The noise standard deviation is a percentage of the action range, taken
    from a schedule during training and from a fixed value during evaluation.
    If the caller supplies its own stdev as the second element of the action
    values list, that stdev is used instead of the scheduled one.
    """

    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the
                                          absolute range of the action space
        :param evaluation_noise_percentage: the noise variance percentage used during evaluation phases
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Additive noise exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))
        # noise is defined as a fraction of the action range, so the range must be finite
        if not np.all(np.isfinite(action_space.high)) or not np.all(np.isfinite(action_space.low)):
            raise ValueError("Additive noise exploration requires bounded actions")
        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # TODO-potential-bug consider separating internally defined stdev and externally defined stdev into 2 policies
        # pick the noise percentage for the current phase
        if self.phase == RunPhase.TEST:
            noise_percentage = self.evaluation_noise_percentage
        else:
            noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise to the action space range
        noise_std = noise_percentage * (self.action_space.high - self.action_space.low)

        # a list holds the action mean (and optionally a stdev); a bare array holds the mean only
        if isinstance(action_values, list):
            action_mean = action_values[0].squeeze()
        else:
            action_mean = action_values.squeeze()

        # advance the noise schedule only while training
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # a caller-provided stdev (second list element) overrides the scheduled one
        if isinstance(action_values, list) and len(action_values) > 1:
            noise_std = action_values[1].squeeze()

        # sample the noisy action around the mean
        return np.random.normal(action_mean, noise_std)

    def get_control_param(self):
        return np.ones(self.action_space.shape) * self.noise_percentage_schedule.current_value

View File

@@ -0,0 +1,59 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class BoltzmannParameters(ExplorationParameters):
    """Parameters for the Boltzmann exploration policy."""
    def __init__(self):
        super().__init__()
        # no default schedule - this must be assigned before the policy is instantiated
        self.temperature_schedule = None

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.boltzmann:Boltzmann'
class Boltzmann(ExplorationPolicy):
    """Exploration policy that samples discrete actions from a softmax over the action values.

    The softmax temperature comes from a schedule: high temperatures make the
    distribution close to uniform, low temperatures make it close to greedy.
    """
    def __init__(self, action_space: ActionSpace, temperature_schedule: Schedule):
        """
        :param action_space: the action space used by the environment
        :param temperature_schedule: the schedule for the temperature parameter of the softmax
        """
        super().__init__(action_space)
        self.temperature_schedule = temperature_schedule

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase == RunPhase.TRAIN:
            self.temperature_schedule.step()
        # numerically stable softmax: subtracting the max before exponentiating
        # avoids overflow for large action values or small temperatures, and
        # leaves the resulting probabilities unchanged
        scaled_values = action_values / self.temperature_schedule.current_value
        exp_probabilities = np.exp(scaled_values - np.max(scaled_values))
        probabilities = exp_probabilities / np.sum(exp_probabilities)
        # make sure probs sum to 1 despite floating point rounding
        probabilities[-1] = 1 - np.sum(probabilities[:-1])
        # choose actions according to the probabilities
        return np.random.choice(range(self.action_space.shape), p=probabilities)

    def get_control_param(self):
        return self.temperature_schedule.current_value

View File

@@ -0,0 +1,77 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
class BootstrappedParameters(EGreedyParameters):
    """Parameters for the Bootstrapped exploration policy."""
    def __init__(self):
        super().__init__()
        # number of Q heads in the bootstrapped network
        self.architecture_num_q_heads = 10
        # presumably the probability that a transition is shared with each head
        # (consumed elsewhere, e.g. by the memory) - TODO confirm against caller
        self.bootstrapped_data_sharing_probability = 1.0
        self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.bootstrapped:Bootstrapped'
class Bootstrapped(EGreedy):
    """E-greedy exploration over a bootstrapped (multi-head) Q network.

    During training, actions follow a single randomly selected head; during
    evaluation, all heads vote and the majority action is fed to e-greedy.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param architecture_num_q_heads: the number of q heads to select from
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        # avoid a mutable default argument: a default evaluated at definition time
        # would be a single AdditiveNoiseParameters object silently shared between
        # every Bootstrapped instance created without an explicit parameters object
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        # index of the head currently driving action selection during training
        self.selected_head = 0
        self.last_action_values = 0

    def select_head(self):
        """Randomly pick the Q head to follow for subsequent training actions."""
        self.selected_head = np.random.randint(self.num_heads)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # action values are None in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.phase == RunPhase.TRAIN:
                action_values = action_values[self.selected_head]
            else:
                # ensemble voting for evaluation: each head votes for its argmax action
                top_action_votings = np.argmax(action_values, axis=-1)
                counts = np.bincount(top_action_votings.squeeze())
                top_action = np.argmax(counts)
                # convert the top action to a one hot vector and pass it to e-greedy
                action_values = np.eye(len(self.action_space.actions))[[top_action]]
            self.last_action_values = action_values
        return super().get_action(action_values)

    def get_control_param(self):
        return self.selected_head

View File

@@ -0,0 +1,48 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class CategoricalParameters(ExplorationParameters):
    """Parameters for the Categorical exploration policy."""
    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.categorical:Categorical'
class Categorical(ExplorationPolicy):
    """Samples discrete actions from the probability distribution given as the action values."""
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        if self.phase == RunPhase.TRAIN:
            # sample an action according to the given probabilities
            return np.random.choice(self.action_space.actions, p=action_values)
        # outside of training, act greedily on the distribution
        return np.argmax(action_values)

    def get_control_param(self):
        # this policy has no tunable exploration parameter
        return 0

View File

@@ -0,0 +1,27 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rl_coach.exploration_policies.additive_noise import AdditiveNoise, AdditiveNoiseParameters
class ContinuousEntropyParameters(AdditiveNoiseParameters):
    # Identical to AdditiveNoiseParameters except for the module path it points at.
    @property
    def path(self):
        """Module path used to dynamically instantiate the policy class."""
        return 'rl_coach.exploration_policies.continuous_entropy:ContinuousEntropy'
class ContinuousEntropy(AdditiveNoise):
    """Behaviorally identical to AdditiveNoise, kept as a distinct class name.

    NOTE(review): presumably exists so agents that learn their own action stdev
    can be configured separately from plain additive noise - confirm with callers.
    """
    pass

View File

@@ -0,0 +1,102 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.schedules import Schedule, LinearSchedule
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
from rl_coach.utils import dynamic_import_and_instantiate_module_from_params
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy
class EGreedyParameters(ExplorationParameters):
    """Parameters for the EGreedy exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for epsilon - the probability of acting randomly
        self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
        # fixed epsilon used during evaluation phases
        self.evaluation_epsilon = 0.05
        # for continuous control, the non-random branch delegates to a noise policy
        # (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
        self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        self.continuous_exploration_policy_parameters.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.e_greedy:EGreedy'
class EGreedy(ExplorationPolicy):
    """Epsilon-greedy exploration policy.

    With probability epsilon a random action is sampled from the action space.
    Otherwise the greedy action is taken (discrete spaces), or a continuous
    exploration policy - additive noise by default - is applied (box spaces).
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule,
                 evaluation_epsilon: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        super().__init__(action_space)
        self.epsilon_schedule = epsilon_schedule
        self.evaluation_epsilon = evaluation_epsilon

        # avoid a mutable default argument: a default evaluated at function definition
        # time would be one AdditiveNoiseParameters object shared by every EGreedy
        # instance created without an explicit parameters object
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()

        if isinstance(self.action_space, BoxActionSpace):
            # for continuous e-greedy (see http://www.cs.ubc.ca/~van/papers/2017-TOG-deepLoco/2017-TOG-deepLoco.pdf)
            continuous_exploration_policy_parameters.action_space = action_space
            self.continuous_exploration_policy = \
                dynamic_import_and_instantiate_module_from_params(continuous_exploration_policy_parameters)

        # drawn once per action so that requires_action_values and get_action agree
        self.current_random_value = np.random.rand()

    def requires_action_values(self):
        """Action values are only needed when the upcoming action will not be random."""
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        return self.current_random_value >= epsilon

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        epsilon = self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value

        if isinstance(self.action_space, DiscreteActionSpace):
            top_action = np.argmax(action_values)
            if self.current_random_value < epsilon:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = top_action
        else:
            # continuous spaces only take fully random actions while training
            if self.current_random_value < epsilon and self.phase == RunPhase.TRAIN:
                chosen_action = self.action_space.sample()
            else:
                chosen_action = self.continuous_exploration_policy.get_action(action_values)

        # step the epsilon schedule and generate a new random value for next time
        if self.phase == RunPhase.TRAIN:
            self.epsilon_schedule.step()
        self.current_random_value = np.random.rand()

        return chosen_action

    def get_control_param(self):
        if isinstance(self.action_space, DiscreteActionSpace):
            return self.evaluation_epsilon if self.phase == RunPhase.TEST else self.epsilon_schedule.current_value
        elif isinstance(self.action_space, BoxActionSpace):
            return self.continuous_exploration_policy.get_control_param()

    def change_phase(self, phase):
        """Propagate phase changes to the wrapped continuous exploration policy."""
        super().change_phase(phase)
        if isinstance(self.action_space, BoxActionSpace):
            self.continuous_exploration_policy.change_phase(phase)

View File

@@ -0,0 +1,76 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
from rl_coach.base_parameters import Parameters
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType
class ExplorationParameters(Parameters):
    """Base class for the parameters of an exploration policy.

    Subclasses add their own parameters and override `path` to point at the
    module and class implementing the policy.
    """
    def __init__(self):
        # initialize the Parameters base class before adding our own attributes;
        # the original skipped this call, unlike the policy parameter subclasses
        # which all invoke super().__init__()
        super().__init__()
        # filled in externally (e.g. by EGreedy.__init__) before instantiation
        self.action_space = None

    @property
    def path(self):
        """Module path used to dynamically instantiate the policy class."""
        return 'rl_coach.exploration_policies.exploration_policy:ExplorationPolicy'
class ExplorationPolicy(object):
    """Base class for exploration policies.

    An exploration policy chooses an action given the action values produced by
    the agent, according to the current run phase and its internal state.
    """
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        # policies start in the heatup phase; change_phase updates this later
        self.phase = RunPhase.HEATUP
        self.action_space = action_space

    def reset(self) -> None:
        """
        Used for resetting the exploration policy parameters when needed
        :return: None
        """
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        """
        Given a list of values corresponding to each action,
        choose one action according to the exploration policy
        :param action_values: A list of action values
        :return: The chosen action
        """
        pass

    def change_phase(self, phase) -> None:
        """
        Change between running phases of the algorithm
        :param phase: Either Heatup or Train
        :return: none
        """
        self.phase = phase

    def requires_action_values(self) -> bool:
        """
        Allows exploration policies to define if they require the action values for the current step.
        This can save up a lot of computation. For example in e-greedy, if the random value generated is smaller
        than epsilon, the action is completely random, and the action values don't need to be calculated
        :return: True if the action values are required. False otherwise
        """
        return True

    def get_control_param(self):
        """Return the current value of the policy's exploration control parameter."""
        return 0

View File

@@ -0,0 +1,46 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace, DiscreteActionSpace, BoxActionSpace
from rl_coach.core_types import ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class GreedyParameters(ExplorationParameters):
    """Parameters for the Greedy exploration policy."""
    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.greedy:Greedy'
class Greedy(ExplorationPolicy):
    """Always selects the greedy action, with no exploration."""
    def __init__(self, action_space: ActionSpace):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # isinstance instead of exact type comparison so that subclasses of the
        # action space types are handled as well
        if isinstance(self.action_space, DiscreteActionSpace):
            # the greedy discrete action is the one with the highest value
            return np.argmax(action_values)
        if isinstance(self.action_space, BoxActionSpace):
            # continuous actions are passed through unchanged
            return action_values
        # any other action space type falls through and returns None, as before

    def get_control_param(self):
        # this policy has no exploration parameter
        return 0

View File

@@ -0,0 +1,81 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.spaces import ActionSpace, BoxActionSpace, GoalsSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
# Based on on the description in:
# https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
class OUProcessParameters(ExplorationParameters):
    """Parameters for the OUProcess exploration policy."""
    def __init__(self):
        super().__init__()
        self.mu = 0        # long-term mean the process reverts to
        self.theta = 0.15  # mean-reversion rate
        self.sigma = 0.2   # diffusion (noise) coefficient
        self.dt = 0.01     # integration time step

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.ou_process:OUProcess'
# Ornstein-Uhlenbeck process
class OUProcess(ExplorationPolicy):
    """Ornstein-Uhlenbeck process exploration for continuous action spaces.

    Adds temporally correlated noise to the action during training; outside of
    training the action is returned unperturbed.
    Based on: https://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
    """
    def __init__(self, action_space: ActionSpace, mu: float=0, theta: float=0.15, sigma: float=0.2, dt: float=0.01):
        """
        :param action_space: the action space used by the environment
        """
        super().__init__(action_space)
        self.mu = float(mu) * np.ones(self.action_space.shape)
        self.theta = float(theta)
        self.sigma = float(sigma) * np.ones(self.action_space.shape)
        self.state = np.zeros(self.action_space.shape)
        self.dt = dt
        if not (isinstance(action_space, BoxActionSpace) or isinstance(action_space, GoalsSpace)):
            raise ValueError("OU process exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))

    def reset(self):
        """Reset the process state back to zero."""
        self.state = np.zeros(self.action_space.shape)

    def noise(self):
        """Advance the OU process one step and return the new state."""
        previous = self.state
        # Euler-Maruyama step: mean-reverting drift plus scaled Gaussian diffusion
        step = self.theta * (self.mu - previous) * self.dt + self.sigma * np.random.randn(len(previous)) * np.sqrt(self.dt)
        self.state = previous + step
        return self.state

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # only training adds exploration noise; other phases act deterministically
        if self.phase == RunPhase.TRAIN:
            perturbation = self.noise()
        else:
            perturbation = np.zeros(self.action_space.shape)
        return action_values.squeeze() + perturbation

    def get_control_param(self):
        # the current process state doubles as the exploration magnitude report
        if self.phase == RunPhase.TRAIN:
            return self.state
        return np.zeros(self.action_space.shape)

View File

@@ -0,0 +1,100 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.schedules import Schedule, LinearSchedule
from scipy.stats import truncnorm
from rl_coach.spaces import ActionSpace, BoxActionSpace
from rl_coach.core_types import RunPhase, ActionType
from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters
class TruncatedNormalParameters(ExplorationParameters):
    """Parameters for the TruncatedNormal exploration policy."""
    def __init__(self):
        super().__init__()
        # schedule for the noise stdev as a percentage of the action range
        self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
        # fixed noise percentage used while evaluating
        self.evaluation_noise_percentage = 0.05
        # truncation bounds for the sampled actions
        self.clip_low = 0
        self.clip_high = 1

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.truncated_normal:TruncatedNormal'
class TruncatedNormal(ExplorationPolicy):
    """Samples continuous actions from a truncated normal distribution.

    The distribution is centered on the predicted action mean, with a standard
    deviation given either by a noise schedule (as a percentage of the action
    range) or by the agent itself, truncated to [clip_low, clip_high].
    """
    def __init__(self, action_space: ActionSpace, noise_percentage_schedule: Schedule,
                 evaluation_noise_percentage: float, clip_low: float, clip_high: float):
        """
        :param action_space: the action space used by the environment
        :param noise_percentage_schedule: the schedule for the noise variance percentage relative to the
                                          absolute range of the action space
        :param evaluation_noise_percentage: the noise variance percentage used during evaluation phases
        :param clip_low: lower bound the sampled actions are truncated to
        :param clip_high: upper bound the sampled actions are truncated to
        """
        super().__init__(action_space)
        self.noise_percentage_schedule = noise_percentage_schedule
        self.evaluation_noise_percentage = evaluation_noise_percentage
        self.clip_low = clip_low
        self.clip_high = clip_high

        if not isinstance(action_space, BoxActionSpace):
            raise ValueError("Truncated normal exploration works only for continuous controls."
                             "The given action space is of type: {}".format(action_space.__class__.__name__))
        if not np.all(np.isfinite(action_space.high)) or not np.all(np.isfinite(action_space.low)):
            # message fixed: it previously named additive noise (copy-paste from additive_noise.py)
            raise ValueError("Truncated normal exploration requires bounded actions")
        # TODO: allow working with unbounded actions by defining the noise in terms of range and not percentage

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # set the current noise percentage according to the phase
        if self.phase == RunPhase.TEST:
            current_noise_percentage = self.evaluation_noise_percentage
        else:
            current_noise_percentage = self.noise_percentage_schedule.current_value

        # scale the noise to the action space range
        action_values_std = current_noise_percentage * (self.action_space.high - self.action_space.low)

        # extract the mean values
        if isinstance(action_values, list):
            # the action values are expected to be a list with the action mean and optionally the action stdev
            action_values_mean = action_values[0].squeeze()
        else:
            # the action values are expected to be a numpy array representing the action mean
            action_values_mean = action_values.squeeze()

        # step the noise schedule only while training
        if self.phase == RunPhase.TRAIN:
            self.noise_percentage_schedule.step()

        # the second element of the list is assumed to be the standard deviation
        if isinstance(action_values, list) and len(action_values) > 1:
            action_values_std = action_values[1].squeeze()

        # sample from the truncated normal distribution; truncnorm expects the
        # truncation bounds normalized to the standard normal, so convert first
        normalized_low = (self.clip_low - action_values_mean) / action_values_std
        normalized_high = (self.clip_high - action_values_mean) / action_values_std
        distribution = truncnorm(normalized_low, normalized_high, loc=action_values_mean, scale=action_values_std)
        return distribution.rvs(1)

    def get_control_param(self):
        return np.ones(self.action_space.shape) * self.noise_percentage_schedule.current_value

View File

@@ -0,0 +1,83 @@
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List
import numpy as np
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.e_greedy import EGreedy, EGreedyParameters
from rl_coach.schedules import Schedule, LinearSchedule, PieceWiseSchedule
from rl_coach.spaces import ActionSpace
from rl_coach.core_types import RunPhase, ActionType, EnvironmentSteps
from rl_coach.exploration_policies.exploration_policy import ExplorationParameters
class UCBParameters(EGreedyParameters):
    """Parameters for the UCB exploration policy."""
    def __init__(self):
        super().__init__()
        # number of Q heads in the network
        self.architecture_num_q_heads = 10
        # presumably the probability that a transition is shared with each head
        # (consumed elsewhere, e.g. by the memory) - TODO confirm against caller
        self.bootstrapped_data_sharing_probability = 1.0
        # epsilon decays in two linear segments over the first 5M environment steps
        self.epsilon_schedule = PieceWiseSchedule([
            (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
            (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
        ])
        # weight of the per-action stdev added to the mean action values
        self.lamb = 0.1

    @property
    def path(self):
        # module path used to dynamically instantiate the policy class
        return 'rl_coach.exploration_policies.ucb:UCB'
class UCB(EGreedy):
    """Upper-confidence-bound exploration over a multi-head Q network.

    The action values handed to the underlying e-greedy policy are the mean over
    the heads, plus (during training) lambda times their standard deviation.
    """
    def __init__(self, action_space: ActionSpace, epsilon_schedule: Schedule, evaluation_epsilon: float,
                 architecture_num_q_heads: int, lamb: float,
                 continuous_exploration_policy_parameters: ExplorationParameters = None):
        """
        :param action_space: the action space used by the environment
        :param epsilon_schedule: a schedule for the epsilon values
        :param evaluation_epsilon: the epsilon value to use for evaluation phases
        :param architecture_num_q_heads: the number of q heads to select from
        :param lamb: lambda coefficient for taking the standard deviation into account
                     (annotation corrected to float - the default in UCBParameters is 0.1)
        :param continuous_exploration_policy_parameters: the parameters of the continuous exploration policy to use
                                                         if the e-greedy is used for a continuous policy
                                                         (defaults to AdditiveNoiseParameters)
        """
        # avoid a mutable default argument: a default evaluated at definition time
        # would be one AdditiveNoiseParameters object shared between all UCB instances
        if continuous_exploration_policy_parameters is None:
            continuous_exploration_policy_parameters = AdditiveNoiseParameters()
        super().__init__(action_space, epsilon_schedule, evaluation_epsilon, continuous_exploration_policy_parameters)
        self.num_heads = architecture_num_q_heads
        self.lamb = lamb
        # standard deviation across heads from the most recent training step
        self.std = 0
        self.last_action_values = 0

    def select_head(self):
        """UCB aggregates over all heads, so there is no single head to select."""
        pass

    def get_action(self, action_values: List[ActionType]) -> ActionType:
        # action values are None in case the exploration policy is going to select a random action
        if action_values is not None:
            if self.requires_action_values():
                mean = np.mean(action_values, axis=0)
                if self.phase == RunPhase.TRAIN:
                    self.std = np.std(action_values, axis=0)
                    # optimism in the face of uncertainty: boost actions the heads disagree on
                    self.last_action_values = mean + self.lamb * self.std
                else:
                    self.last_action_values = mean
        return super().get_action(self.last_action_values)

    def get_control_param(self):
        if self.phase == RunPhase.TRAIN:
            return np.mean(self.std)
        return 0