In this tutorial we'll build a new agent that implements the Categorical Deep Q Network algorithm (https://arxiv.org/pdf/1707.06887.pdf), and a preset that runs the agent on the breakout game of the Atari environment.

# The Agent

We'll start by defining a new head for the neural network used by this algorithm - ```CategoricalQHead```. 

A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through a neural network to produce the output of the network. There can be multiple heads in a network, and each one has an assigned loss function. The heads are algorithm dependent.

It will be defined in a new file - ```architectures/tensorflow_components/heads/categorical_dqn_head.py```.

First - some imports.

In [None]:
import os
import sys
# Put the parent directory on the import path (once) so the packages imported below resolve.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

import tensorflow as tf
from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import QActionStateValue
from rl_coach.spaces import SpacesDefinition

Now let's define two classes - ```CategoricalQHeadParameters```, which holds the head parameters, and ```CategoricalQHead```, the head itself.

In [None]:
class CategoricalQHeadParameters(HeadParameters):
    """Parameters object that describes a ``CategoricalQHead``."""

    def __init__(self, activation_function: str = 'relu', name: str = 'categorical_q_head_params'):
        # CategoricalQHead is only looked up when this constructor runs, so it may be
        # defined after this class in the file.
        super().__init__(
            parameterized_class=CategoricalQHead,
            activation_function=activation_function,
            name=name,
        )

class CategoricalQHead(Head):
    """Network head producing, for every action, a categorical distribution over value atoms.

    The number of actions is read from the action space and the number of atoms from the
    algorithm parameters (``agent_parameters.algorithm.atoms``).
    """

    def __init__(
        self,
        agent_parameters: AgentParameters,
        spaces: SpacesDefinition,
        network_name: str,
        head_idx: int = 0,
        loss_weight: float = 1.,
        is_local: bool = True,
        activation_function: str = 'relu',
    ):
        super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)
        self.name = 'categorical_dqn_head'
        self.num_actions = len(self.spaces.action.actions)
        self.num_atoms = agent_parameters.algorithm.atoms
        self.return_type = QActionStateValue

    def _build_module(self, input_layer):
        """Build the head's output and loss on top of ``input_layer``.

        Sets ``self.output`` to per-action atom probabilities, ``self.target`` to a
        placeholder for the target distributions, and registers the cross entropy
        between the two as the loss.
        """
        # NOTE(review): self.input presumably lists extra placeholders the head exposes - confirm in Head
        self.actions = tf.placeholder(tf.int32, [None], name="actions")
        self.input = [self.actions]

        # one logit per (action, atom) pair, produced flat and then split into two axes
        flat_logits = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')
        batch_dim = tf.shape(flat_logits)[0]
        atom_logits = tf.reshape(flat_logits, (batch_dim, self.num_actions, self.num_atoms))

        # softmax normalizes over the last axis, i.e. over the atoms of each action
        self.output = tf.nn.softmax(atom_logits)

        # cross entropy between the fed target distributions and the predicted logits
        target_shape = (None, self.num_actions, self.num_atoms)
        self.distributions = tf.placeholder(tf.float32, shape=target_shape, name="distributions")
        self.target = self.distributions
        self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=atom_logits)
        tf.losses.add_loss(self.loss)

Now let's go ahead and define the network parameters - it will reuse the DQN network parameters but the head parameters will be our ```CategoricalQHeadParameters```

In [None]:
from rl_coach.agents.dqn_agent import DQNNetworkParameters


class CategoricalDQNNetworkParameters(DQNNetworkParameters):
    """DQN network parameters whose heads are replaced by a single categorical Q head."""

    def __init__(self):
        super().__init__()
        # everything else is inherited from the DQN network parameters
        categorical_head = CategoricalQHeadParameters()
        self.heads_parameters = [categorical_head]

Next we'll define the algorithm parameters, which are the same as the DQN algorithm parameters, with the addition of the Categorical DQN specific v_min, v_max and number of atoms.
We'll also define the parameters of the exploration policy, which is epsilon greedy with epsilon starting at a value of 1.0 and decaying linearly to 0.01 over 1,000,000 steps.

In [None]:
from rl_coach.agents.dqn_agent import DQNAlgorithmParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.schedules import LinearSchedule


class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):
    """DQN algorithm parameters extended with the Categorical DQN specific settings."""

    def __init__(self):
        super().__init__()
        # v_min / v_max bound the values covered by the atoms; `atoms` is how many there are
        self.v_min, self.v_max = -10.0, 10.0
        self.atoms = 51


class CategoricalDQNExplorationParameters(EGreedyParameters):
    """Epsilon-greedy exploration parameters used by the Categorical DQN agent."""

    def __init__(self):
        super().__init__()
        # epsilon goes from 1 down to 0.01 along a linear schedule of 1,000,000 steps
        initial_epsilon, final_epsilon, decay_steps = 1, 0.01, 1000000
        self.epsilon_schedule = LinearSchedule(initial_epsilon, final_epsilon, decay_steps)
        self.evaluation_epsilon = 0.001

Now let's define the agent parameters class which contains all the parameters to be used by the agent - the network, algorithm and exploration parameters that we defined above, and also the parameters of the memory module to be used, which is experience replay in this case.

In [None]:
from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent
from rl_coach.base_parameters import AgentParameters
from rl_coach.core_types import StateType
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters


class CategoricalDQNAgentParameters(AgentParameters):
    """All parameters of the Categorical DQN agent.

    Combines the algorithm, exploration and network parameters defined for Categorical DQN
    with an experience replay memory. The network parameters are registered under the
    ``"main"`` key.
    """

    def __init__(self):
        super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),
                         exploration=CategoricalDQNExplorationParameters(),
                         memory=ExperienceReplayParameters(),
                         networks={"main": CategoricalDQNNetworkParameters()})

    @property
    def path(self):
        """Return the ``'<module>:<class>'`` locator of the agent class.

        The module part is qualified with the ``rl_coach`` package, matching how every
        other module of the library is imported in this tutorial; the unqualified
        ``agents.categorical_dqn_agent`` is not importable from the repository root.
        """
        # NOTE(review): presumably resolved through a dynamic import by the framework - confirm
        return 'rl_coach.agents.categorical_dqn_agent:CategoricalDQNAgent'

The last step is to define the agent itself - ```CategoricalDQNAgent``` - which is a type of value optimization agent so it will inherit the ```ValueOptimizationAgent``` class. Our agent will implement the ```learn_from_batch``` function which updates the agent's networks according to an input batch of transitions.

In [None]:
from typing import Union

import numpy as np


# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf
class CategoricalDQNAgent(ValueOptimizationAgent):
    """Agent that learns a categorical distribution over returns for every action.

    The distribution is supported on ``atoms`` evenly spaced values between ``v_min`` and
    ``v_max`` (stored in ``self.z_values``). Q values are obtained as the dot product of
    the predicted atom probabilities with that support.
    """

    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
        super().__init__(agent_parameters, parent)
        # NOTE(review): self.ap is presumably set by the base class from agent_parameters - confirm
        self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)

    def distribution_prediction_to_q_values(self, prediction):
        """Collapse atom probabilities into Q values.

        :param prediction: array whose last axis holds the probabilities of the atoms
        :return: dot product of ``prediction`` with ``self.z_values`` (the atoms axis is summed out)
        """
        return np.dot(prediction, self.z_values)

    # prediction's format is (batch,actions,atoms)
    def get_all_q_values_for_states(self, states: StateType):
        """Predict the value distributions for ``states`` and return the resulting Q values."""
        prediction = self.get_prediction(states)
        return self.distribution_prediction_to_q_values(prediction)

    def learn_from_batch(self, batch):
        """Run one training step on a batch of transitions.

        For every transition, the next-state distribution of the greedy action (predicted by
        the target network) is shifted by the reward, scaled by the discount, clipped to the
        support and projected back onto the fixed atoms. The result replaces the online
        network's prediction for the action that was taken and is used as training target.

        :param batch: the batch of transitions to learn from
        :return: ``(total_loss, losses, unclipped_grads)`` - the first three items returned
                 by ``train_and_sync_networks``
        """
        network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()

        # for the action we actually took, the error is calculated by the atoms distribution
        # for all other actions, the error is 0
        distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([
            (self.networks['main'].target_network, batch.next_states(network_keys)),
            (self.networks['main'].online_network, batch.states(network_keys))
        ])

        # only update the action that we have actually done in this transition
        target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)

        # size the buffers by the rows that were actually predicted instead of the configured
        # batch size, so a batch of a different length is still handled row by row
        # NOTE(review): assumes the prediction is an array of shape (batch, actions, atoms) - confirm
        num_transitions = distributed_q_st_plus_1.shape[0]
        m = np.zeros((num_transitions, self.z_values.size))
        batches = np.arange(num_transitions)

        # loop invariants
        # NOTE(review): assumes rewards()/game_overs() are side-effect free accessors - confirm
        rewards = batch.rewards()
        not_done = 1.0 - batch.game_overs()
        discount = self.ap.algorithm.discount
        z_min, z_max = self.z_values[0], self.z_values[-1]
        delta_z = self.z_values[1] - self.z_values[0]

        for j in range(self.z_values.size):
            # Bellman update of atom j, clipped to the range covered by the support
            tzj = np.fmax(np.fmin(rewards + not_done * discount * self.z_values[j], z_max), z_min)
            # fractional index of the updated atom within the support
            bj = (tzj - z_min) / delta_z
            upper = np.ceil(bj).astype(int)
            lower = np.floor(bj).astype(int)
            atom_probability = distributed_q_st_plus_1[batches, target_actions, j]

            # When bj falls exactly on an atom, lower == upper and both interpolation weights
            # (upper - bj) and (bj - lower) are zero, which would silently drop this atom's
            # probability mass. Give the whole mass to that atom instead.
            lower_weight = np.where(lower == upper, 1.0, upper - bj)
            upper_weight = bj - lower

            m[batches, lower] += atom_probability * lower_weight
            m[batches, upper] += atom_probability * upper_weight

        # total_loss = cross entropy between actual result above and predicted result for the given action
        TD_targets[batches, batch.actions()] = m

        result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)
        total_loss, losses, unclipped_grads = result[:3]

        return total_loss, losses, unclipped_grads

# The Preset

The new preset will be defined in a new file - ```presets/atari_categorical_dqn.py```.


First - let's define the agent parameters

In [None]:
from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAgentParameters


# Start from the default Categorical DQN agent parameters and override the learning rate
# of the 'main' network wrapper.
agent_params = CategoricalDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025

Environment parameters

In [None]:
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection


# Atari environment parameters. The level is a single selection out of atari_deterministic_v4;
# the concrete game is picked further down with `level.select(...)`.
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

Schedule and visualization parameters

In [None]:
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.core_types import EnvironmentSteps, RunPhase
from rl_coach.base_parameters import VisualizationParameters


# Run schedule - every quantity below is expressed in environment steps.
# NOTE(review): going by the attribute names, these are the total training length, the
# training steps between two evaluation periods, the length of one evaluation period and
# the length of the initial heatup phase - confirm against ScheduleParameters.
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

# Visualization - video dumps are filtered by the two dump methods below and mp4 output is off.
# NOTE(review): presumably only TEST-phase episodes reaching a new maximum are dumped - confirm
vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

Connecting all the dots together - we'll define a graph manager with the Categorical DQN agent parameters, the Atari environment parameters, and the scheduling and visualization parameters defined above.

In [None]:
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager


# Assemble the graph manager from the agent, environment, schedule and visualization
# parameters defined in the previous cells.
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
)

# pick the concrete game out of the level selection, and turn rendering on
graph_manager.env_params.level.select('breakout')
graph_manager.visualization_parameters.render = True

# Running the Preset
(this is normally done from command line by running ```coach -p Atari_C51 ... ```)

In [None]:
from rl_coach.base_parameters import TaskParameters, Frameworks

# Directory handed to the task parameters as the experiment path.
log_path = '../experiments/atari_categorical_dqn'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test
os.makedirs(log_path, exist_ok=True)

task_parameters = TaskParameters(framework_type="tensorflow",
                                 evaluate_only=False,
                                 experiment_path=log_path)

# Written straight into the instance dictionary rather than through attribute assignment.
# NOTE(review): a value of None presumably disables periodic checkpoint saving - confirm
task_parameters.__dict__['save_checkpoint_secs'] = None

graph_manager.create_graph(task_parameters)

# let the adventure begin
graph_manager.improve()
