#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import copy
from enum import Enum
from typing import Union

import numpy as np

from rl_coach.agents.agent import Agent
from rl_coach.architectures.head_parameters import MeasurementsPredictionHeadParameters
from rl_coach.architectures.embedder_parameters import InputEmbedderParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.architectures.tensorflow_components.layers import Conv2d, Dense
from rl_coach.base_parameters import AlgorithmParameters, AgentParameters, NetworkParameters, \
    MiddlewareScheme
from rl_coach.core_types import ActionInfo, EnvironmentSteps, RunPhase
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.spaces import SpacesDefinition, VectorObservationSpace


class HandlingTargetsAfterEpisodeEnd(Enum):
    LastStep = 0
    NAN = 1


class DFPNetworkParameters(NetworkParameters):
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
                                           'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
                                           'goal': InputEmbedderParameters(activation_function='leaky_relu')}

        self.input_embedders_parameters['observation'].scheme = [
            Conv2d(32, 8, 4),
            Conv2d(64, 4, 2),
            Conv2d(64, 3, 1),
            Dense(512),
        ]

        self.input_embedders_parameters['measurements'].scheme = [
            Dense(128),
            Dense(128),
            Dense(128),
        ]

        self.input_embedders_parameters['goal'].scheme = [
            Dense(128),
            Dense(128),
            Dense(128),
        ]

        self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
                                                            scheme=MiddlewareScheme.Empty)
        self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
        self.async_training = False
        self.batch_size = 64
        self.adam_optimizer_beta1 = 0.95


class DFPMemoryParameters(EpisodicExperienceReplayParameters):
    def __init__(self):
        self.max_size = (MemoryGranularity.Transitions, 20000)
        self.shared_memory = True
        super().__init__()


class DFPAlgorithmParameters(AlgorithmParameters):
    """
    :param num_predicted_steps_ahead: (int)
        Number of future steps to predict measurements for. The future steps won't be sequential, but rather jump
        in powers of 2. For example, if num_predicted_steps_ahead = 3, then the steps will be: t+1, t+2, t+4.
        The predicted steps will be [t + 2**i for i in range(num_predicted_steps_ahead)]

    :param goal_vector: (List[float])
        The goal vector weights each of the measurements to form an optimization goal. The vector should have
        the same length as the number of measurements, and it is multiplied (dot product) with the predicted
        future measurements to score each action. Positive values correspond to trying to maximize the particular
        measurement, and negative values correspond to trying to minimize the particular measurement.

    :param future_measurements_weights: (List[float])
        The future_measurements_weights weight the contribution of each of the predicted timesteps to the
        optimization goal. For example, if 6 steps are predicted ahead and future_measurements_weights has
        3 values, then only the last 3 timesteps will be taken into account, according to the weights in the
        future_measurements_weights vector.

    :param use_accumulated_reward_as_measurement: (bool)
        If set to True, the accumulated reward from the beginning of the episode will be added as a measurement
        to the measurements vector in the state. This can be useful in environments where the given measurements
        don't include enough information for the particular goal the agent should achieve.

    :param handling_targets_after_episode_end: (HandlingTargetsAfterEpisodeEnd)
        Dictates how to handle measurement targets that fall outside the episode length.

    :param scale_measurements_targets: (Dict[str, float])
        Allows rescaling the values of each of the available measurements. This can be useful when the
        measurements have different scales and you want to normalize them to the same scale.
    """
    def __init__(self):
        super().__init__()
        self.num_predicted_steps_ahead = 6
        self.goal_vector = [1.0, 1.0]
        self.future_measurements_weights = [0.5, 0.5, 1.0]
        self.use_accumulated_reward_as_measurement = False
        self.handling_targets_after_episode_end = HandlingTargetsAfterEpisodeEnd.NAN
        self.scale_measurements_targets = {}
        self.num_consecutive_playing_steps = EnvironmentSteps(8)


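# Illustrative note (not part of the original module): with the default parameters above, the agent predicts
# measurement changes at the offsets [t + 2**i for i in range(6)] = t+1, t+2, t+4, t+8, t+16, t+32.
# An action's score in DFPAgent.choose_action() is then, roughly:
#
#     future_steps_values = predicted_changes @ goal_vector           # one value per predicted step
#     score = future_steps_values[-3:] @ [0.5, 0.5, 1.0]              # only the last 3 steps contribute
#
# i.e. with the defaults only the t+8, t+16 and t+32 predictions influence action selection, with the
# farthest step weighted most heavily.

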
class DFPAgentParameters(AgentParameters):
    def __init__(self):
        super().__init__(algorithm=DFPAlgorithmParameters(),
                         exploration=EGreedyParameters(),
                         memory=DFPMemoryParameters(),
                         networks={"main": DFPNetworkParameters()})

    @property
    def path(self):
        return 'rl_coach.agents.dfp_agent:DFPAgent'


# Direct Future Prediction Agent - http://vladlen.info/papers/learning-to-act.pdf
class DFPAgent(Agent):
    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent'] = None):
        super().__init__(agent_parameters, parent)
        self.current_goal = self.ap.algorithm.goal_vector
        self.target_measurements_scale_factors = None

    @property
    def is_on_policy(self) -> bool:
        # This is only somewhat correct, as the algorithm uses a very small (20k) ER that keeps only recently
        # seen samples. So it is approximately on-policy (although, to be completely strict, it is off-policy).
        return True

    def learn_from_batch(self, batch):
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        network_inputs = batch.states(network_keys)
        network_inputs['goal'] = np.repeat(np.expand_dims(self.current_goal, 0),
                                           batch.size, axis=0)

        # get the current outputs of the network
        targets = self.networks['main'].online_network.predict(network_inputs)

        # change the targets for the taken actions
        for i in range(batch.size):
            targets[i, batch.actions()[i]] = batch[i].info['future_measurements'].flatten()

        result = self.networks['main'].train_and_sync_networks(network_inputs, targets)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads

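    # Explanatory sketch (not part of the original code): learn_from_batch() regresses the measurements
    # prediction head towards 'future_measurements' only for the action that was actually taken. Because the
    # targets for all other actions are initialized to the network's own current predictions, their error is
    # zero and they contribute (essentially) no gradient, e.g.:
    #
    #     targets = online_network.predict(inputs)          # shape ~ (batch, num_actions, steps * measurements)
    #     targets[i, taken_action_i] = future_measurements  # only this slice produces a learning signal
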
    def choose_action(self, curr_state):
        if self.exploration_policy.requires_action_values():
            # predict the future measurements
            tf_input_state = self.prepare_batch_for_inference(curr_state, 'main')
            tf_input_state['goal'] = np.expand_dims(self.current_goal, 0)
            measurements_future_prediction = self.networks['main'].online_network.predict(tf_input_state)[0]
            action_values = np.zeros(len(self.spaces.action.actions))
            num_steps_used_for_objective = len(self.ap.algorithm.future_measurements_weights)

            # calculate the score of each action by multiplying its future measurements with the goal vector
            for action_idx in range(len(self.spaces.action.actions)):
                action_measurements = measurements_future_prediction[action_idx]
                action_measurements = np.reshape(action_measurements,
                                                 (self.ap.algorithm.num_predicted_steps_ahead,
                                                  self.spaces.state['measurements'].shape[0]))
                future_steps_values = np.dot(action_measurements, self.current_goal)
                action_values[action_idx] = np.dot(future_steps_values[-num_steps_used_for_objective:],
                                                   self.ap.algorithm.future_measurements_weights)
        else:
            action_values = None

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
        action, _ = self.exploration_policy.get_action(action_values)

        if action_values is not None:
            action_values = action_values.squeeze()
            action_info = ActionInfo(action=action, action_value=action_values[action])
        else:
            action_info = ActionInfo(action=action)

        return action_info

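    # Shape sketch for choose_action() (illustrative only): per action, the network outputs a flat vector of
    # length num_predicted_steps_ahead * num_measurements, reshaped above to
    # (num_predicted_steps_ahead, num_measurements), e.g. (6, 2) for two measurements with the defaults.
    # Dotting it with the goal vector yields one value per predicted step, and the last
    # len(future_measurements_weights) of those values are combined into the action's score.
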
    def set_environment_parameters(self, spaces: SpacesDefinition):
        self.spaces = copy.deepcopy(spaces)
        self.spaces.goal = VectorObservationSpace(shape=self.spaces.state['measurements'].shape,
                                                  measurements_names=
                                                  self.spaces.state['measurements'].measurements_names)

        # if the user has filled in some scale values, check that the names match the measurements space
        if set(self.spaces.state['measurements'].measurements_names).intersection(
                self.ap.algorithm.scale_measurements_targets.keys()) !=\
                set(self.ap.algorithm.scale_measurements_targets.keys()):
            raise ValueError("Some of the keys in parameter scale_measurements_targets ({}) are not defined in "
                             "the measurements space {}".format(self.ap.algorithm.scale_measurements_targets.keys(),
                                                                self.spaces.state['measurements'].measurements_names))

        super().set_environment_parameters(self.spaces)

        # the below is done after calling the base class method, as it might add the accumulated reward
        # as a measurement

        # fill out the missing measurements scale factors
        for measurement_name in self.spaces.state['measurements'].measurements_names:
            if measurement_name not in self.ap.algorithm.scale_measurements_targets:
                self.ap.algorithm.scale_measurements_targets[measurement_name] = 1

        self.target_measurements_scale_factors = \
            np.array([self.ap.algorithm.scale_measurements_targets[measurement_name] for measurement_name in
                      self.spaces.state['measurements'].measurements_names])

    def handle_episode_ended(self):
        last_episode = self.current_episode_buffer
        if self.phase in [RunPhase.TRAIN, RunPhase.HEATUP] and last_episode:
            self._update_measurements_targets(last_episode,
                                              self.ap.algorithm.num_predicted_steps_ahead)
        super().handle_episode_ended()

    def _update_measurements_targets(self, episode, num_steps):
        if 'measurements' not in episode.transitions[0].state or episode.transitions[0].state['measurements'] == []:
            raise ValueError("Measurements are not present in the transitions of the last episode played.")
        measurements_size = self.spaces.state['measurements'].shape[0]
        for transition_idx, transition in enumerate(episode.transitions):
            transition.info['future_measurements'] = np.zeros((num_steps, measurements_size))
            for step in range(num_steps):
                offset_idx = transition_idx + 2 ** step

                if offset_idx >= episode.length():
                    if self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.NAN:
                        # the special MSE loss will ignore those entries, so their gradient will be 0
                        transition.info['future_measurements'][step] = np.nan
                        continue

                    elif self.ap.algorithm.handling_targets_after_episode_end == HandlingTargetsAfterEpisodeEnd.LastStep:
                        offset_idx = -1

                transition.info['future_measurements'][step] = \
                    self.target_measurements_scale_factors * \
                    (episode.transitions[offset_idx].state['measurements'] - transition.state['measurements'])

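    # Worked example (illustrative, not part of the original code): for the transition at index t in an episode
    # of length L, the targets are the scaled measurement differences at offsets 1, 2, 4, ..., 2**(num_steps - 1):
    #
    #     future_measurements[step] = scale * (m[t + 2**step] - m[t])
    #
    # When t + 2**step >= L, the NAN option writes np.nan (masked out by the loss as noted above), while the
    # LastStep option substitutes the episode's final measurements instead.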