In this tutorial we'll add the DeepMind Control Suite environment to Coach, and create a preset that trains the DDPG agent on the new environment.

# Setup
First, follow the installation instructions here: https://github.com/deepmind/dm_control#installation-and-requirements. 


Make sure your ```LD_LIBRARY_PATH``` contains the path to the GLEW and GLFW libraries (https://github.com/openai/mujoco-py/issues/110).


In addition, MuJoCo rendering might need to be disabled (https://github.com/deepmind/dm_control/issues/20). To do so, uncomment the ```DISABLE_MUJOCO_RENDERING``` line in the cell below.

In [None]:
import os
#os.environ['DISABLE_MUJOCO_RENDERING'] = '1'

import sys
# Put the parent of the current working directory on the import path (once),
# so that packages living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# The Environment Wrapper

To integrate an environment with Coach, we need to implement an environment wrapper which is placed under the environments folder. In our case, we'll implement the ```control_suite_environment.py``` file.


We'll start with some helper classes - ```ObservationType``` and ```ControlSuiteEnvironmentParameters```.

In [None]:
from enum import Enum
from dm_control import suite
from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter



class ObservationType(Enum):
    """Selects which observations the control suite wrapper exposes to the agent."""

    # only the vector of measurements taken from the environment observation
    Measurements = 1
    # only the rendered pixels
    Image = 2
    # both the rendered pixels and the measurements vector
    Image_and_Measurements = 3


# Parameters
class ControlSuiteEnvironmentParameters(EnvironmentParameters):
    """Parameters used to construct a ControlSuiteEnvironment."""

    def __init__(self):
        super().__init__()
        # which observations the wrapper exposes (see ObservationType)
        self.observation_type = ObservationType.Measurements
        # ControlSuiteInputFilter / ControlSuiteOutputFilter are module-level names defined
        # further down in this file; they are only looked up when an instance is created
        self.default_input_filter = ControlSuiteInputFilter
        self.default_output_filter = ControlSuiteOutputFilter

    @property
    def path(self):
        """Return the '<module>:<class>' location of the environment wrapper class.

        The module part uses the ``rl_coach`` package prefix so that it matches the way the
        rest of this file imports the wrapper module
        (``rl_coach.environments.control_suite_environment``).
        """
        return 'rl_coach.environments.control_suite_environment:ControlSuiteEnvironment'


# ControlSuite Environment Components
ControlSuiteInputFilter = NoInputFilter()
ControlSuiteOutputFilter = NoOutputFilter()

# Maps every benchmarking task to itself, keyed by its 'domain:task' identifier
# (each entry of suite.BENCHMARKING is joined with ':').
control_suite_envs = {level_id: level_id
                      for level_id in (':'.join(env) for env in suite.BENCHMARKING)}

Now let's define the control suite's environment wrapper class.

In the ```__init__``` function we'll load and initialize the environment, and define the internal state space and action space members, which make sure the states and actions stay within their allowed limits.

In [None]:
import numpy as np
import random
from typing import Union
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, VectorObservationSpace, StateSpace
from dm_control.suite.wrappers import pixels


# Environment
class ControlSuiteEnvironment(Environment):
    """Coach environment wrapper around a DeepMind Control Suite task."""

    def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,
                 seed: Union[None, int]=None, human_control: bool=False,
                 observation_type: ObservationType=ObservationType.Measurements,
                 custom_reward_threshold: Union[int, float]=None, **kwargs):
        """Load the control suite task and build the state and action spaces.

        :param level: the level to load; the base class is expected to expose it as
            ``self.env_id`` in ``domain:task`` form (it is split on ':' below)
        :param frame_skip: forwarded to the base Environment
        :param visualization_parameters: forwarded to the base Environment
        :param seed: when not None, seeds numpy, ``random`` and the control suite task itself
        :param human_control: forwarded to the base Environment; also doubles the render scale
        :param observation_type: which observations to expose - measurements, pixels or both
        :param custom_reward_threshold: forwarded to the base Environment
        :param kwargs: accepted for interface compatibility and ignored
        """
        super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)

        self.observation_type = observation_type

        # load and initialize environment
        domain_name, task_name = self.env_id.split(":")
        # The control suite tasks draw from their own random state, so seeding the global
        # generators alone does not make episodes reproducible - hand the seed to the task too.
        # Nothing extra is passed when no seed was requested, keeping the default behavior.
        task_kwargs = {'random': self.seed} if self.seed is not None else None
        self.env = suite.load(domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)

        if observation_type != ObservationType.Measurements:
            # adds a 'pixels' entry to the observation (and drops the rest when pixels_only is set)
            self.env = pixels.Wrapper(self.env, pixels_only=observation_type == ObservationType.Image)

        # seed
        if self.seed is not None:
            np.random.seed(self.seed)
            random.seed(self.seed)

        self.state_space = StateSpace({})

        # query the (possibly wrapped) environment specs once
        observation_spec = self.env.observation_spec()
        action_spec = self.env.action_spec()

        # image observations
        if observation_type != ObservationType.Measurements:
            self.state_space['pixels'] = ImageObservationSpace(shape=observation_spec['pixels'].shape,
                                                               high=255)

        # measurements observations
        if observation_type != ObservationType.Image:
            measurements_space_size = 0
            measurements_names = []
            for observation_space_name, observation_space in observation_spec.items():
                rank = len(observation_space.shape)
                if rank == 0:
                    # scalar entry - contributes a single measurement
                    measurements_space_size += 1
                    measurements_names.append(observation_space_name)
                elif rank == 1:
                    # vector entry - contributes one measurement per element
                    measurements_space_size += observation_space.shape[0]
                    measurements_names.extend(["{}_{}".format(observation_space_name, i) for i in
                                               range(observation_space.shape[0])])
                # entries of higher rank (e.g. 'pixels') are not part of the measurements vector
            self.state_space['measurements'] = VectorObservationSpace(shape=measurements_space_size,
                                                                      measurements_names=measurements_names)

        # actions
        self.action_space = BoxActionSpace(
            shape=action_spec.shape[0],
            low=action_spec.minimum,
            high=action_spec.maximum
        )

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 2 if self.human_control else 1
            if not self.native_rendering:
                # screen size is (width, height) of the rendered image, scaled up for human control
                self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)

The following functions cover the API expected from a new environment wrapper:

1. ```_update_state``` - update the internal state of the wrapper (to be queried by the agent)
2. ```_take_action``` - take an action on the environment 
3. ```_restart_environment_episode``` - restart the environment on a new episode 
4. ```get_rendered_image``` - get a rendered image of the environment in its current state

In [None]:
class ControlSuiteEnvironment(Environment):
    """Environment-wrapper API of ControlSuiteEnvironment: state update, stepping, reset and rendering."""

    def _update_state(self):
        """Refresh ``self.state``, ``self.reward`` and ``self.done`` from ``self.last_result``."""
        observation = self.last_result.observation
        self.state = {}

        if self.observation_type != ObservationType.Measurements:
            self.pixels = observation['pixels']
            self.state['pixels'] = self.pixels

        if self.observation_type != ObservationType.Image:
            # Only scalar and 1-D entries belong to the measurements vector. Entries of higher
            # rank - such as the 3-D 'pixels' image that is present when both images and
            # measurements are requested - cannot be concatenated into a flat vector, so skip them.
            # The leading empty float array keeps the result a float array even with no entries.
            parts = [np.array([])]
            for sub_observation in observation.values():
                if np.ndim(sub_observation) <= 1:
                    parts.append(np.atleast_1d(sub_observation))
            self.measurements = np.concatenate(parts)
            self.state['measurements'] = self.measurements

        # a missing reward (None) is reported as 0
        self.reward = self.last_result.reward if self.last_result.reward is not None else 0

        self.done = self.last_result.last()

    def _take_action(self, action):
        """Clip the action to the action space (for box action spaces) and step the environment."""
        if isinstance(self.action_space, BoxActionSpace):
            action = self.action_space.clip_action_to_space(action)

        self.last_result = self.env.step(action)

    def _restart_environment_episode(self, force_environment_reset=False):
        """Reset the environment; ``force_environment_reset`` is accepted but not used here."""
        self.last_result = self.env.reset()

    def get_rendered_image(self):
        """Return the image rendered by the environment physics from camera 0."""
        return self.env.physics.render(camera_id=0)

# The Preset
The new preset will be defined in a new file - ```presets/ControlSuite_DDPG.py```.

First - let's define the agent parameters

In [None]:
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.gym_environment import MujocoInputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter


agent_params = DDPGAgentParameters()

actor_network = agent_params.network_wrappers['actor']
critic_network = agent_params.network_wrappers['critic']

# Both networks ship with an input embedder registered under 'observation';
# re-register it under 'measurements' instead.
for network in (actor_network, critic_network):
    embedders = network.input_embedders_parameters
    embedders['measurements'] = embedders.pop('observation')

# actor layers
actor_network.input_embedders_parameters['measurements'].scheme = [Dense([300])]
actor_network.middleware_parameters.scheme = [Dense([200])]

# critic layers (the action input gets an empty embedder scheme)
critic_network.input_embedders_parameters['measurements'].scheme = [Dense([400])]
critic_network.middleware_parameters.scheme = [Dense([300])]
critic_network.input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

# input filter, with rewards rescaled by a factor of 1/10
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1/10.))

Now let's define the environment parameters

In [None]:
from rl_coach.environments.control_suite_environment import ControlSuiteEnvironmentParameters, control_suite_envs
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection

# Environment: the level is chosen later, out of the control suite benchmarking tasks
env_params = ControlSuiteEnvironmentParameters()
env_params.level = SingleLevelSelection(control_suite_envs)

# Visualization: no mp4 dumps; video dumping is governed by the two dump methods below
vis_params = VisualizationParameters()
vis_params.dump_mp4 = False
vis_params.video_dump_methods = [
    SelectedPhaseOnlyDumpMethod(RunPhase.TEST),
    MaxDumpMethod(),
]

The schedule parameters will define the number of heatup steps, periodic evaluation steps, and training steps between evaluations.

In [None]:
from rl_coach.graph_managers.graph_manager import ScheduleParameters


schedule_params = ScheduleParameters()
# heatup length, in environment steps
schedule_params.heatup_steps = EnvironmentSteps(1000)
# evaluate for 1 episode after every 20 episodes
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
# overall training budget (10,000,000,000 training steps)
schedule_params.improve_steps = TrainingSteps(10 ** 10)

Finally, we'll create and run the graph manager

In [None]:
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.base_parameters import TaskParameters, Frameworks


# Tie the agent, environment, schedule and visualization parameters together
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
)

# pick the concrete task to train on, and render the environment while running
graph_manager.env_params.level.select('walker:walk')
graph_manager.visualization_parameters.render = True


# Directory used as the experiment path for this run
log_path = '../experiments/control_suite_walker_ddpg'
# exist_ok=True creates the directory (and any missing parents) only when needed,
# without the race between a separate existence check and the creation
os.makedirs(log_path, exist_ok=True)
    
task_parameters = TaskParameters(
    framework_type="tensorflow",
    evaluate_only=False,
    experiment_path=log_path,
)

# Written straight into the instance dict rather than by attribute assignment.
# NOTE(review): presumably this sidesteps attribute-assignment checks on TaskParameters - confirm.
vars(task_parameters)['save_checkpoint_secs'] = None

# build the graph for this task
graph_manager.create_graph(task_parameters)

# let the adventure begin
graph_manager.improve()
