From 2046358ab0633f5b2aac71159d16b43354f5002c Mon Sep 17 00:00:00 2001
From: Sina Afrooze <sina.beh@gmail.com>
Date: Tue, 30 Oct 2018 02:02:37 -0700
Subject: [PATCH] Add docstring for architecture (#47)

- Removed get_model() from architecture because it is only an implementation detail of architecture.
---
 rl_coach/architectures/architecture.py        | 153 ++++++++++++++++--
 .../tensorflow_components/architecture.py     |   8 +
 2 files changed, 146 insertions(+), 15 deletions(-)

diff --git a/rl_coach/architectures/architecture.py b/rl_coach/architectures/architecture.py
index 2d0377a..1ae2d47 100644
--- a/rl_coach/architectures/architecture.py
+++ b/rl_coach/architectures/architecture.py
@@ -14,6 +14,10 @@
 # limitations under the License.
 #
 
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+
 from rl_coach.base_parameters import AgentParameters
 from rl_coach.spaces import SpacesDefinition
 
@@ -21,15 +25,15 @@ from rl_coach.spaces import SpacesDefinition
 class Architecture(object):
     def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, name: str= ""):
         """
+        Creates a neural network 'architecture', that can be trained and used for inference.
+
         :param agent_parameters: the agent parameters
         :param spaces: the spaces (observation, action, etc.) definition of the agent
         :param name: the name of the network
         """
-        # spaces
         self.spaces = spaces
-
         self.name = name
-        self.network_wrapper_name = self.name.split('/')[0]  # the name can be main/online and the network_wrapper_name will be main
+        self.network_wrapper_name = self.name.split('/')[0]  # e.g. 'main/online' --> 'main'
         self.full_name = "{}/{}".format(agent_parameters.full_name_id, name)
         self.network_parameters = agent_parameters.network_wrappers[self.network_wrapper_name]
         self.batch_size = self.network_parameters.batch_size
@@ -37,35 +41,154 @@ class Architecture(object):
         self.optimizer = None
         self.ap = agent_parameters
 
-    def get_model(self):
+    def predict(self, inputs: Dict[str, np.ndarray]) -> List[np.ndarray]:
+        """
+        Given input observations, use the model to make predictions (e.g. action or value).
+
+        :param inputs: current state (i.e. observations, measurements, goals, etc.)
+            (e.g. `{'observation': numpy.ndarray}` of shape (batch_size, observation_space_size))
+        :return: predictions of action or value (of shape (batch_size, action_space_size) for action predictions)
+        """
         pass
 
-    def predict(self, inputs):
+    def train_on_batch(self,
+                       inputs: Dict[str, np.ndarray],
+                       targets: List[np.ndarray],
+                       scaler: float=1.,
+                       additional_fetches: list=None,
+                       importance_weights: np.ndarray=None) -> tuple:
+        """
+        Given a batch of inputs (e.g. states) and targets (e.g. discounted rewards), takes a training step: i.e. runs a
+        forward pass and backward pass of the network, accumulates the gradients and applies an optimization step to
+        update the weights.
+        Calls `accumulate_gradients` followed by `apply_and_reset_gradients`.
+        Note: Currently an unused method.
+
+        :param inputs: typically the environment states (but can also contain other data necessary for loss).
+            (e.g. `{'observation': numpy.ndarray}` with `observation` of shape (batch_size, observation_space_size) or
+            (batch_size, observation_space_size, stack_size) or
+            `{'observation': numpy.ndarray, 'output_0_0': numpy.ndarray}` with `output_0_0` of shape (batch_size,))
+        :param targets: target values of shape (batch_size, ). For example discounted rewards for value network
+            for calculating the value-network loss would be a target. Length of list and order of arrays in
+            the list matches that of network losses which are defined by network parameters
+        :param scaler: value to scale gradients by before optimizing network weights
+        :param additional_fetches: list of additional values to fetch and return. The type of each list
+            element is framework dependent.
+        :param importance_weights: ndarray of shape (batch_size,) to multiply with batch loss.
+        :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
+            total_loss (float): sum of all head losses
+            losses (list of float): list of all losses. The order is list of target losses followed by list
+                of regularization losses. The specifics of losses are dependent on the network parameters
+                (number of heads, etc.)
+            norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
+            fetched_tensors: all values for additional_fetches
+        """
         pass
 
-    def train_on_batch(self, inputs, targets):
+    def get_weights(self) -> List[np.ndarray]:
+        """
+        Gets model weights as a list of ndarrays. It is used for synchronizing weights between two identical networks.
+
+        :return: list of weights as ndarrays
+        """
         pass
 
-    def get_weights(self):
+    def set_weights(self, weights: List[np.ndarray], rate: float=1.0) -> None:
+        """
+        Sets model weights for provided layer parameters.
+
+        :param weights: list of model weights in the same order as received in get_weights
+        :param rate: controls the mixture of given weight values versus old weight values.
+            i.e. new_weight = rate * given_weight + (1 - rate) * old_weight
+        :return: None
+        """
         pass
 
-    def set_weights(self, weights, rate=1.0):
+    def reset_accumulated_gradients(self) -> None:
+        """
+        Sets gradient of all parameters to 0.
+
+        Once gradients are reset, they must be accessible by `accumulated_gradients` property of this class,
+        which must return a list of numpy ndarrays. Child class must ensure that `accumulated_gradients` is set.
+        """
         pass
 
-    def reset_accumulated_gradients(self):
+    def accumulate_gradients(self,
+                             inputs: Dict[str, np.ndarray],
+                             targets: List[np.ndarray],
+                             additional_fetches: list=None,
+                             importance_weights: np.ndarray=None,
+                             no_accumulation: bool=False) ->\
+            Tuple[float, List[float], float, list]:
+        """
+        Given a batch of inputs (i.e. states) and targets (e.g. discounted rewards), computes and accumulates the
+        gradients for model parameters. Will run forward and backward pass to compute gradients, clip the gradient
+        values if required and then accumulate gradients from all learners. It does not update the model weights,
+        that's performed in `apply_and_reset_gradients` method.
+
+        Once gradients are accumulated, they are accessed by `accumulated_gradients` property of this class.
+
+        :param inputs: typically the environment states (but can also contain other data for loss)
+            (e.g. `{'observation': numpy.ndarray}` with `observation` of shape (batch_size, observation_space_size) or
+             (batch_size, observation_space_size, stack_size) or
+            `{'observation': numpy.ndarray, 'output_0_0': numpy.ndarray}` with `output_0_0` of shape (batch_size,))
+        :param targets: targets for calculating loss. For example discounted rewards for value network
+            for calculating the value-network loss would be a target. Length of list and order of arrays in
+            the list matches that of network losses which are defined by network parameters
+        :param additional_fetches: list of additional values to fetch and return. The type of each list
+            element is framework dependent.
+        :param importance_weights: ndarray of shape (batch_size,) to multiply with batch loss.
+        :param no_accumulation: if True, set gradient values to the new gradients, otherwise sum with previously
+            calculated gradients
+        :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
+            total_loss (float): sum of all head losses
+            losses (list of float): list of all losses. The order is list of target losses followed by list of regularization losses.
+                The specifics of losses are dependent on the network parameters (number of heads, etc.)
+            norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
+            fetched_tensors: all values for additional_fetches
+        """
         pass
 
-    def accumulate_gradients(self, inputs, targets):
+    def apply_and_reset_gradients(self, gradients: List[np.ndarray]) -> None:
+        """
+        Applies the given gradients to the network weights and resets the gradient accumulations.
+        Has the same impact as calling `apply_gradients`, then `reset_accumulated_gradients`.
+
+        :param gradients: gradients for the parameter weights, taken from `accumulated_gradients` property
+            of an identical network (either self or another identical network)
+        """
         pass
 
-    def apply_and_reset_gradients(self, gradients):
+    def apply_gradients(self, gradients: List[np.ndarray]) -> None:
+        """
+        Applies the given gradients to the network weights.
+        Will be performed sync or async depending on `network_parameters.async_training`
+
+        :param gradients: gradients for the parameter weights, taken from `accumulated_gradients` property
+            of an identical network (either self or another identical network)
+        """
         pass
 
-    def apply_gradients(self, gradients):
+    def get_variable_value(self, variable: Any) -> np.ndarray:
+        """
+        Gets value of a specified variable. Type of variable is dependent on the framework.
+        Example of a variable is head.kl_coefficient, which could be a symbol for evaluation
+        or could be a string representing the value.
+
+        :param variable: variable of interest
+        :return: value of the specified variable
+        """
         pass
 
-    def get_variable_value(self, variable):
-        pass
+    def set_variable_value(self, assign_op: Any, value: np.ndarray, placeholder: Any):
+        """
+        Updates the value of a specified variable. Type of assign_op is dependent on the framework
+        and is a unique identifier for assigning value to a variable. For example an agent may use
+        head.assign_kl_coefficient. There is a one to one mapping between assign_op and placeholder
+        (in the example above, placeholder would be head.kl_coefficient_ph).
 
-    def set_variable_value(self, assign_op, value, placeholder=None):
+        :param assign_op: a parameter representing the operation for assigning value to a specific variable
+        :param value: value of the specified variable used for update
+        :param placeholder: a placeholder for binding the value to assign_op.
+        """
         pass
diff --git a/rl_coach/architectures/tensorflow_components/architecture.py b/rl_coach/architectures/tensorflow_components/architecture.py
index e731920..7c5c248 100644
--- a/rl_coach/architectures/tensorflow_components/architecture.py
+++ b/rl_coach/architectures/tensorflow_components/architecture.py
@@ -146,6 +146,14 @@ class TensorFlowArchitecture(Architecture):
             # set the fetches for training
             self._set_initial_fetch_list()
 
+    def get_model(self) -> None:
+        """
+        Constructs the model using `network_parameters` and sets `input_embedders`, `middleware`,
+        `output_heads`, `outputs`, `losses`, `total_loss`, `adaptive_learning_rate_scheme`,
+        `current_learning_rate`, and `optimizer`
+        """
+        raise NotImplementedError
+
     def _set_initial_fetch_list(self):
         """
         Create an initial list of tensors to fetch in each training iteration