pre-release 0.10.0
tutorials/1. Implementing an Algorithm.ipynb (new file, 407 lines)
@@ -0,0 +1,407 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we'll build a new agent that implements the Categorical Deep Q Network algorithm (https://arxiv.org/pdf/1707.06887.pdf), and a preset that runs the agent on the breakout game of the Atari environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We'll start by defining a new head for the neural network used by this algorithm - ```CategoricalQHead```. \n",
|
||||
"\n",
|
||||
"A head is the final part of the network. It takes the embedding from the middleware embedder and passes it through a neural network to produce the output of the network. There can be multiple heads in a network, and each one has an assigned loss function. The heads are algorithm dependent.\n",
|
||||
"\n",
|
||||
"It will be defined in a new file - ```architectures/tensorflow_components/heads/categorical_dqn_head.py```.\n",
|
||||
"\n",
|
||||
"First - some imports."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
"\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from rl_coach.architectures.tensorflow_components.heads.head import Head, HeadParameters\n",
|
||||
"from rl_coach.base_parameters import AgentParameters\n",
|
||||
"from rl_coach.core_types import QActionStateValue\n",
|
||||
"from rl_coach.spaces import SpacesDefinition"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's define a class - ```CategoricalQHeadParameters``` - containing the head parameters and the head itself. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class CategoricalQHeadParameters(HeadParameters):\n",
|
||||
" def __init__(self, activation_function: str ='relu', name: str='categorical_q_head_params'):\n",
|
||||
" super().__init__(parameterized_class=CategoricalQHead, activation_function=activation_function, name=name)\n",
|
||||
"\n",
|
||||
"class CategoricalQHead(Head):\n",
|
||||
" def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,\n",
|
||||
" head_idx: int = 0, loss_weight: float = 1., is_local: bool = True, activation_function: str ='relu'):\n",
|
||||
" super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local, activation_function)\n",
|
||||
" self.name = 'categorical_dqn_head'\n",
|
||||
" self.num_actions = len(self.spaces.action.actions)\n",
|
||||
" self.num_atoms = agent_parameters.algorithm.atoms\n",
|
||||
" self.return_type = QActionStateValue\n",
|
||||
"\n",
|
||||
" def _build_module(self, input_layer):\n",
|
||||
" self.actions = tf.placeholder(tf.int32, [None], name=\"actions\")\n",
|
||||
" self.input = [self.actions]\n",
|
||||
"\n",
|
||||
" values_distribution = tf.layers.dense(input_layer, self.num_actions * self.num_atoms, name='output')\n",
|
||||
" values_distribution = tf.reshape(values_distribution, (tf.shape(values_distribution)[0], self.num_actions,\n",
|
||||
" self.num_atoms))\n",
|
||||
" # softmax on atoms dimension\n",
|
||||
" self.output = tf.nn.softmax(values_distribution)\n",
|
||||
"\n",
|
||||
" # calculate cross entropy loss\n",
|
||||
" self.distributions = tf.placeholder(tf.float32, shape=(None, self.num_actions, self.num_atoms),\n",
|
||||
" name=\"distributions\")\n",
|
||||
" self.target = self.distributions\n",
|
||||
" self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.target, logits=values_distribution)\n",
|
||||
" tf.losses.add_loss(self.loss)"
|
||||
]
|
||||
},
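  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick note on what this head outputs: for every action $a$ it produces a softmax distribution $p(s, a)$ over the $N_{atoms}$ fixed support values $z_0, \\dots, z_{N_{atoms}-1}$. The agent defined later in this notebook recovers regular Q-values from this distribution by taking its expectation,\n",
    "\n",
    "$$Q(s, a) = \\sum_{j} z_j \\, p_j(s, a)$$\n",
    "\n",
    "which is what ```distribution_prediction_to_q_values``` computes with ```np.dot(prediction, self.z_values)```."
   ]
  },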
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's go ahead and define the network parameters - it will reuse the DQN network parameters but the head parameters will be our ```CategoricalQHeadParameters```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.agents.dqn_agent import DQNNetworkParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CategoricalDQNNetworkParameters(DQNNetworkParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.heads_parameters = [CategoricalQHeadParameters()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next we'll define the algorithm parameters, which are the same as the DQN algorithm parameters, with the addition of the Categorical DQN specific v_min, v_max and number of atoms.\n",
|
||||
"We'll also define the parameters of the exploration policy, which is epsilon greedy with epsilon starting at a value of 1.0 and decaying to 0.01 throughout 1,000,000 steps."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.agents.dqn_agent import DQNAlgorithmParameters\n",
|
||||
"from rl_coach.exploration_policies.e_greedy import EGreedyParameters\n",
|
||||
"from rl_coach.schedules import LinearSchedule\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CategoricalDQNAlgorithmParameters(DQNAlgorithmParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.v_min = -10.0\n",
|
||||
" self.v_max = 10.0\n",
|
||||
" self.atoms = 51\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CategoricalDQNExplorationParameters(EGreedyParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)\n",
|
||||
" self.evaluation_epsilon = 0.001 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's define the agent parameters class which contains all the parameters to be used by the agent - the network, algorithm and exploration parameters that we defined above, and also the parameters of the memory module to be used, which is experience replay in this case."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.agents.value_optimization_agent import ValueOptimizationAgent\n",
|
||||
"from rl_coach.base_parameters import AgentParameters\n",
|
||||
"from rl_coach.core_types import StateType\n",
|
||||
"from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class CategoricalDQNAgentParameters(AgentParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__(algorithm=CategoricalDQNAlgorithmParameters(),\n",
|
||||
" exploration=CategoricalDQNExplorationParameters(),\n",
|
||||
" memory=ExperienceReplayParameters(),\n",
|
||||
" networks={\"main\": CategoricalDQNNetworkParameters()})\n",
|
||||
"\n",
|
||||
" @property\n",
|
||||
" def path(self):\n",
|
||||
" return 'agents.categorical_dqn_agent:CategoricalDQNAgent'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The last step is to define the agent itself - ```CategoricalDQNAgent``` - which is a type of value optimization agent so it will inherit the ```ValueOptimizationAgent``` class. Our agent will implement the ```learn_from_batch``` function which updates the agent's networks according to an input batch of transitions."
|
||||
]
|
||||
},
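  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before looking at the code, here is a short recap of the distributional Bellman projection that ```learn_from_batch``` below implements (this is the standard C51 projection from the paper; the code underneath is the authoritative version). The support atoms are $z_j = v_{min} + j \\Delta z$ with $\\Delta z = (v_{max} - v_{min}) / (N_{atoms} - 1)$, which is exactly ```np.linspace(v_min, v_max, atoms)```. For each atom $j$:\n",
    "\n",
    "$$\\hat{T} z_j = \\mathrm{clip}\\big(r + \\gamma (1 - done)\\, z_j,\\; v_{min},\\; v_{max}\\big), \\qquad b_j = \\frac{\\hat{T} z_j - v_{min}}{\\Delta z}, \\qquad l = \\lfloor b_j \\rfloor, \\; u = \\lceil b_j \\rceil$$\n",
    "\n",
    "and the target network's probability for atom $j$ (under the greedy next action $a^*$) is split between the two neighboring atoms:\n",
    "\n",
    "$$m_l \\leftarrow m_l + p_j(s', a^*)(u - b_j), \\qquad m_u \\leftarrow m_u + p_j(s', a^*)(b_j - l)$$\n",
    "\n",
    "The resulting distribution $m$ is then used as the cross entropy target for the action that was actually taken."
   ]
  },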
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from typing import Union\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Categorical Deep Q Network - https://arxiv.org/pdf/1707.06887.pdf\n",
|
||||
"class CategoricalDQNAgent(ValueOptimizationAgent):\n",
|
||||
" def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):\n",
|
||||
" super().__init__(agent_parameters, parent)\n",
|
||||
" self.z_values = np.linspace(self.ap.algorithm.v_min, self.ap.algorithm.v_max, self.ap.algorithm.atoms)\n",
|
||||
"\n",
|
||||
" def distribution_prediction_to_q_values(self, prediction):\n",
|
||||
" return np.dot(prediction, self.z_values)\n",
|
||||
"\n",
|
||||
" # prediction's format is (batch,actions,atoms)\n",
|
||||
" def get_all_q_values_for_states(self, states: StateType):\n",
|
||||
" prediction = self.get_prediction(states)\n",
|
||||
" return self.distribution_prediction_to_q_values(prediction)\n",
|
||||
"\n",
|
||||
" def learn_from_batch(self, batch):\n",
|
||||
" network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()\n",
|
||||
"\n",
|
||||
" # for the action we actually took, the error is calculated by the atoms distribution\n",
|
||||
" # for all other actions, the error is 0\n",
|
||||
" distributed_q_st_plus_1, TD_targets = self.networks['main'].parallel_prediction([\n",
|
||||
" (self.networks['main'].target_network, batch.next_states(network_keys)),\n",
|
||||
" (self.networks['main'].online_network, batch.states(network_keys))\n",
|
||||
" ])\n",
|
||||
"\n",
|
||||
" # only update the action that we have actually done in this transition\n",
|
||||
" target_actions = np.argmax(self.distribution_prediction_to_q_values(distributed_q_st_plus_1), axis=1)\n",
|
||||
" m = np.zeros((self.ap.network_wrappers['main'].batch_size, self.z_values.size))\n",
|
||||
"\n",
|
||||
" batches = np.arange(self.ap.network_wrappers['main'].batch_size)\n",
|
||||
" for j in range(self.z_values.size):\n",
|
||||
" tzj = np.fmax(np.fmin(batch.rewards() +\n",
|
||||
" (1.0 - batch.game_overs()) * self.ap.algorithm.discount * self.z_values[j],\n",
|
||||
" self.z_values[self.z_values.size - 1]),\n",
|
||||
" self.z_values[0])\n",
|
||||
" bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])\n",
|
||||
" u = (np.ceil(bj)).astype(int)\n",
|
||||
" l = (np.floor(bj)).astype(int)\n",
|
||||
" m[batches, l] = m[batches, l] + (distributed_q_st_plus_1[batches, target_actions, j] * (u - bj))\n",
|
||||
" m[batches, u] = m[batches, u] + (distributed_q_st_plus_1[batches, target_actions, j] * (bj - l))\n",
|
||||
" # total_loss = cross entropy between actual result above and predicted result for the given action\n",
|
||||
" TD_targets[batches, batch.actions()] = m\n",
|
||||
"\n",
|
||||
" result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets)\n",
|
||||
" total_loss, losses, unclipped_grads = result[:3]\n",
|
||||
"\n",
|
||||
" return total_loss, losses, unclipped_grads"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Preset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The new preset will be defined in a new file - ```presets/atari_categorical_dqn.py```.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"First - let's define the agent parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAgentParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"agent_params = CategoricalDQNAgentParameters()\n",
|
||||
"agent_params.network_wrappers['main'].learning_rate = 0.00025"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Environment parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4\n",
|
||||
"from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"env_params = Atari()\n",
|
||||
"env_params.level = SingleLevelSelection(atari_deterministic_v4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Schedule and visualization parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.graph_managers.graph_manager import ScheduleParameters\n",
|
||||
"from rl_coach.core_types import EnvironmentSteps, RunPhase\n",
|
||||
"from rl_coach.base_parameters import VisualizationParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"schedule_params = ScheduleParameters()\n",
|
||||
"schedule_params.improve_steps = EnvironmentSteps(50000000)\n",
|
||||
"schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)\n",
|
||||
"schedule_params.evaluation_steps = EnvironmentSteps(135000)\n",
|
||||
"schedule_params.heatup_steps = EnvironmentSteps(50000)\n",
|
||||
"\n",
|
||||
"vis_params = VisualizationParameters()\n",
|
||||
"vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]\n",
|
||||
"vis_params.dump_mp4 = False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Connecting all the dots together - we'll define a graph manager with the Categorial DQN agent parameters, the Atari environment parameters, and the scheduling and visualization parameters defined above"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,\n",
|
||||
" schedule_params=schedule_params, vis_params=vis_params)\n",
|
||||
"graph_manager.env_params.level.select('breakout')\n",
|
||||
"graph_manager.visualization_parameters.render = True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Running the Preset\n",
|
||||
"(this is normally done from command line by running ```python coach.py -p atari_categorical_dqn ... ```)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.base_parameters import TaskParameters, Frameworks\n",
|
||||
"\n",
|
||||
"log_path = '../experiments/atari_categorical_dqn'\n",
|
||||
"if not os.path.exists(log_path):\n",
|
||||
" os.makedirs(log_path)\n",
|
||||
" \n",
|
||||
"task_parameters = TaskParameters(framework_type=\"tensorflow\", \n",
|
||||
" evaluate_only=False,\n",
|
||||
" experiment_path=log_path)\n",
|
||||
"\n",
|
||||
"task_parameters.__dict__['save_checkpoint_secs'] = None\n",
|
||||
"\n",
|
||||
"graph_manager.create_graph(task_parameters)\n",
|
||||
"\n",
|
||||
"# let the adventure begin\n",
|
||||
"graph_manager.improve()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
tutorials/2. Adding an Environment.ipynb (new file, 386 lines)
@@ -0,0 +1,386 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we'll add the DeepMind Control Suite environment to Coach, and create a preset that trains the DDPG agent on the new environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Setup\n",
|
||||
"First, follow the installation instructions here: https://github.com/deepmind/dm_control#installation-and-requirements. \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Make sure your ```LD_LIBRARY_PATH``` contains the path to the GLEW and LGFW libraries (https://github.com/openai/mujoco-py/issues/110).\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"In addition, Mujoco rendering might need to be disabled (https://github.com/deepmind/dm_control/issues/20)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.environ['DISABLE_MUJOCO_RENDERING'] = '1'\n",
|
||||
"\n",
|
||||
"import sys\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Environment Wrapper\n",
|
||||
"\n",
|
||||
"To integrate an environment with Coach, we need to implement an environment wrapper which is placed under the environments folder. In our case, we'll implement the ```control_suite_environment.py``` file.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"We'll start with some helper classes - ```ObservationType``` and ```ControlSuiteEnvironmentParameters```."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from enum import Enum\n",
|
||||
"from dm_control import suite\n",
|
||||
"from rl_coach.environments.environment import Environment, EnvironmentParameters, LevelSelection\n",
|
||||
"from rl_coach.filters.filter import NoInputFilter, NoOutputFilter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class ObservationType(Enum):\n",
|
||||
" Measurements = 1\n",
|
||||
" Image = 2\n",
|
||||
" Image_and_Measurements = 3\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Parameters\n",
|
||||
"class ControlSuiteEnvironmentParameters(EnvironmentParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.observation_type = ObservationType.Measurements\n",
|
||||
" self.default_input_filter = ControlSuiteInputFilter\n",
|
||||
" self.default_output_filter = ControlSuiteOutputFilter\n",
|
||||
"\n",
|
||||
" @property\n",
|
||||
" def path(self):\n",
|
||||
" return 'environments.control_suite_environment:ControlSuiteEnvironment'\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\"\"\"\n",
|
||||
"ControlSuite Environment Components\n",
|
||||
"\"\"\"\n",
|
||||
"ControlSuiteInputFilter = NoInputFilter()\n",
|
||||
"ControlSuiteOutputFilter = NoOutputFilter()\n",
|
||||
"\n",
|
||||
"control_suite_envs = {':'.join(env): ':'.join(env) for env in suite.BENCHMARKING}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's define the control suite's environment wrapper class.\n",
|
||||
"\n",
|
||||
"In the ```__init__``` function we'll load and initialize the environment, and the internal state and action space members which will make sure the states and actions are within their allowed limits."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import random\n",
|
||||
"from typing import Union\n",
|
||||
"from rl_coach.base_parameters import VisualizationParameters\n",
|
||||
"from rl_coach.spaces import BoxActionSpace, ImageObservationSpace, VectorObservationSpace, StateSpace\n",
|
||||
"from dm_control.suite.wrappers import pixels\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Environment\n",
|
||||
"class ControlSuiteEnvironment(Environment):\n",
|
||||
" def __init__(self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters,\n",
|
||||
" seed: Union[None, int]=None, human_control: bool=False,\n",
|
||||
" observation_type: ObservationType=ObservationType.Measurements,\n",
|
||||
" custom_reward_threshold: Union[int, float]=None, **kwargs):\n",
|
||||
" super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)\n",
|
||||
"\n",
|
||||
" self.observation_type = observation_type\n",
|
||||
"\n",
|
||||
" # load and initialize environment\n",
|
||||
" domain_name, task_name = self.env_id.split(\":\")\n",
|
||||
" self.env = suite.load(domain_name=domain_name, task_name=task_name)\n",
|
||||
"\n",
|
||||
" if observation_type != ObservationType.Measurements:\n",
|
||||
" self.env = pixels.Wrapper(self.env, pixels_only=observation_type == ObservationType.Image)\n",
|
||||
"\n",
|
||||
" # seed\n",
|
||||
" if self.seed is not None:\n",
|
||||
" np.random.seed(self.seed)\n",
|
||||
" random.seed(self.seed)\n",
|
||||
"\n",
|
||||
" self.state_space = StateSpace({})\n",
|
||||
"\n",
|
||||
" # image observations\n",
|
||||
" if observation_type != ObservationType.Measurements:\n",
|
||||
" self.state_space['pixels'] = ImageObservationSpace(shape=self.env.observation_spec()['pixels'].shape,\n",
|
||||
" high=255)\n",
|
||||
"\n",
|
||||
" # measurements observations\n",
|
||||
" if observation_type != ObservationType.Image:\n",
|
||||
" measurements_space_size = 0\n",
|
||||
" measurements_names = []\n",
|
||||
" for observation_space_name, observation_space in self.env.observation_spec().items():\n",
|
||||
" if len(observation_space.shape) == 0:\n",
|
||||
" measurements_space_size += 1\n",
|
||||
" measurements_names.append(observation_space_name)\n",
|
||||
" elif len(observation_space.shape) == 1:\n",
|
||||
" measurements_space_size += observation_space.shape[0]\n",
|
||||
" measurements_names.extend([\"{}_{}\".format(observation_space_name, i) for i in\n",
|
||||
" range(observation_space.shape[0])])\n",
|
||||
" self.state_space['measurements'] = VectorObservationSpace(shape=measurements_space_size,\n",
|
||||
" measurements_names=measurements_names)\n",
|
||||
"\n",
|
||||
" # actions\n",
|
||||
" self.action_space = BoxActionSpace(\n",
|
||||
" shape=self.env.action_spec().shape[0],\n",
|
||||
" low=self.env.action_spec().minimum,\n",
|
||||
" high=self.env.action_spec().maximum\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # initialize the state by getting a new state from the environment\n",
|
||||
" self.reset_internal_state(True)\n",
|
||||
"\n",
|
||||
" # render\n",
|
||||
" if self.is_rendered:\n",
|
||||
" image = self.get_rendered_image()\n",
|
||||
" scale = 1\n",
|
||||
" if self.human_control:\n",
|
||||
" scale = 2\n",
|
||||
" if not self.native_rendering:\n",
|
||||
" self.renderer.create_screen(image.shape[1]*scale, image.shape[0]*scale)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following functions cover the API expected from a new environment wrapper:\n",
|
||||
"\n",
|
||||
"1. ```_update_state``` - update the internal state of the wrapper (to be queried by the agent)\n",
|
||||
"2. ```_take_action``` - take an action on the environment \n",
|
||||
"3. ```_restart_environment_episode``` - restart the environment on a new episode \n",
|
||||
"4. ```get_rendered_image``` - get a rendered image of the environment in its current state"
|
||||
]
|
||||
},
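  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make the contract clearer, here is a rough, simplified sketch of how the base ```Environment``` class is expected to drive these four methods (this is an illustration only, not Coach's actual implementation - ```EnvironmentSketch``` is a made-up name):\n",
    "\n",
    "```python\n",
    "# simplified sketch only - the real flow lives in rl_coach.environments.environment.Environment\n",
    "class EnvironmentSketch(object):\n",
    "    def step(self, action):\n",
    "        self._take_action(action)   # push the action into the underlying simulator\n",
    "        self._update_state()        # refresh self.state, self.reward and self.done\n",
    "\n",
    "    def reset_internal_state(self, force_environment_reset=False):\n",
    "        self._restart_environment_episode(force_environment_reset)\n",
    "        self._update_state()\n",
    "```\n",
    "\n",
    "```get_rendered_image``` is only used for visualization (e.g. when dumping videos of evaluation episodes)."
   ]
  },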
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class ControlSuiteEnvironment(Environment):\n",
|
||||
" def _update_state(self):\n",
|
||||
" self.state = {}\n",
|
||||
"\n",
|
||||
" if self.observation_type != ObservationType.Measurements:\n",
|
||||
" self.pixels = self.last_result.observation['pixels']\n",
|
||||
" self.state['pixels'] = self.pixels\n",
|
||||
"\n",
|
||||
" if self.observation_type != ObservationType.Image:\n",
|
||||
" self.measurements = np.array([])\n",
|
||||
" for sub_observation in self.last_result.observation.values():\n",
|
||||
" if isinstance(sub_observation, np.ndarray) and len(sub_observation.shape) == 1:\n",
|
||||
" self.measurements = np.concatenate((self.measurements, sub_observation))\n",
|
||||
" else:\n",
|
||||
" self.measurements = np.concatenate((self.measurements, np.array([sub_observation])))\n",
|
||||
" self.state['measurements'] = self.measurements\n",
|
||||
"\n",
|
||||
" self.reward = self.last_result.reward if self.last_result.reward is not None else 0\n",
|
||||
"\n",
|
||||
" self.done = self.last_result.last()\n",
|
||||
"\n",
|
||||
" def _take_action(self, action):\n",
|
||||
" if type(self.action_space) == BoxActionSpace:\n",
|
||||
" action = self.action_space.clip_action_to_space(action)\n",
|
||||
"\n",
|
||||
" self.last_result = self.env.step(action)\n",
|
||||
"\n",
|
||||
" def _restart_environment_episode(self, force_environment_reset=False):\n",
|
||||
" self.last_result = self.env.reset()\n",
|
||||
"\n",
|
||||
" def get_rendered_image(self):\n",
|
||||
" return self.env.physics.render(camera_id=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Preset\n",
|
||||
"The new preset will be defined in a new file - ```presets\\ControlSuite_DDPG.py```. \n",
|
||||
"\n",
|
||||
"First - let's define the agent parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.agents.ddpg_agent import DDPGAgentParameters\n",
|
||||
"from rl_coach.architectures.tensorflow_components.architecture import Dense\n",
|
||||
"from rl_coach.base_parameters import VisualizationParameters, EmbedderScheme\n",
|
||||
"from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase\n",
|
||||
"from rl_coach.environments.gym_environment import MujocoInputFilter\n",
|
||||
"from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"agent_params = DDPGAgentParameters()\n",
|
||||
"agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \\\n",
|
||||
" agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')\n",
|
||||
"agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \\\n",
|
||||
" agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')\n",
|
||||
"agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'].scheme = [Dense([300])]\n",
|
||||
"agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([200])]\n",
|
||||
"agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'].scheme = [Dense([400])]\n",
|
||||
"agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]\n",
|
||||
"agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty\n",
|
||||
"agent_params.input_filter = MujocoInputFilter()\n",
|
||||
"agent_params.input_filter.add_reward_filter(\"rescale\", RewardRescaleFilter(1/10.))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's define the environment parameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.environments.control_suite_environment import ControlSuiteEnvironmentParameters, control_suite_envs\n",
|
||||
"from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection\n",
|
||||
"\n",
|
||||
"env_params = ControlSuiteEnvironmentParameters()\n",
|
||||
"env_params.level = SingleLevelSelection(control_suite_envs)\n",
|
||||
"\n",
|
||||
"vis_params = VisualizationParameters()\n",
|
||||
"vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]\n",
|
||||
"vis_params.dump_mp4 = False"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The schedule parameters will define the number of heatup steps, periodice evaluation steps, training steps between evaluations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.graph_managers.graph_manager import ScheduleParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"schedule_params = ScheduleParameters()\n",
|
||||
"schedule_params.improve_steps = TrainingSteps(10000000000)\n",
|
||||
"schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)\n",
|
||||
"schedule_params.evaluation_steps = EnvironmentEpisodes(1)\n",
|
||||
"schedule_params.heatup_steps = EnvironmentSteps(1000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Finally, we'll create and run the graph manager"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager\n",
|
||||
"from rl_coach.base_parameters import TaskParameters, Frameworks\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,\n",
|
||||
" schedule_params=schedule_params, vis_params=vis_params)\n",
|
||||
"\n",
|
||||
"graph_manager.env_params.level.select('walker:walk')\n",
|
||||
"#graph_manager.visualization_parameters.render = True\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"log_path = '../experiments/control_suite_walker_ddpg'\n",
|
||||
"if not os.path.exists(log_path):\n",
|
||||
" os.makedirs(log_path)\n",
|
||||
" \n",
|
||||
"task_parameters = TaskParameters(framework_type=\"tensorflow\", \n",
|
||||
" evaluate_only=False,\n",
|
||||
" experiment_path=log_path)\n",
|
||||
"\n",
|
||||
"task_parameters.__dict__['save_checkpoint_secs'] = None\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"graph_manager.create_graph(task_parameters)\n",
|
||||
"\n",
|
||||
"# let the adventure begin\n",
|
||||
"graph_manager.improve()\n"
|
||||
]
|
||||
},
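  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As in the previous tutorial, once the preset is saved as ```presets/ControlSuite_DDPG.py``` it would normally be run from the command line rather than from this notebook, along the lines of ```python coach.py -p ControlSuite_DDPG -lvl walker:walk``` (the exact flag used to select the level may differ between Coach versions)."
   ]
  },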
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
tutorials/3. Implementing a Hierarchical RL Graph.ipynb (new file, 410 lines)
@@ -0,0 +1,410 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"In this tutorial we'll demonstrate Coach's hierarchical RL support, by building a new agent that implements the Hierarchical Actor Critic (HAC) algorithm (https://arxiv.org/pdf/1712.00948.pdf), and a preset that runs the agent on Mujoco's pendulum challenge."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, some imports. Note that HAC is based on DDPG, hence we will be importing the relevant classes. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"module_path = os.path.abspath(os.path.join('..'))\n",
|
||||
"if module_path not in sys.path:\n",
|
||||
" sys.path.append(module_path)\n",
|
||||
" sys.path.append(module_path + '/rl_coach')\n",
|
||||
" \n",
|
||||
"from typing import Union\n",
|
||||
"import numpy as np\n",
|
||||
"from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters\n",
|
||||
"from rl_coach.spaces import SpacesDefinition\n",
|
||||
"from rl_coach.core_types import RunPhase"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now let's define the HAC algorithm and agent parameters.\n",
|
||||
"\n",
|
||||
"See tutorial 1 for more details on the content of each of these classes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.sub_goal_testing_rate = 0.5\n",
|
||||
" self.time_limit = 40\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"class HACDDPGAgentParameters(DDPGAgentParameters):\n",
|
||||
" def __init__(self):\n",
|
||||
" super().__init__()\n",
|
||||
" self.algorithm = DDPGAlgorithmParameters()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we'll define the agent itself - ```HACDDPGAgent``` - which subclasses the DDPG agent class. The main difference between the DDPG agent and the HACDDPGAgent is the subgoal a higher level agent defines to a lower level agent, hence the overrides of the DDPG Agent functions."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class HACDDPGAgent(DDPGAgent):\n",
|
||||
" def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):\n",
|
||||
" super().__init__(agent_parameters, parent)\n",
|
||||
" self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate\n",
|
||||
" self.graph_manager = None\n",
|
||||
"\n",
|
||||
" def choose_action(self, curr_state):\n",
|
||||
" # top level decides, for each of his generated sub-goals, for all the layers beneath him if this is a sub-goal\n",
|
||||
" # testing phase\n",
|
||||
" graph_manager = self.parent_level_manager.parent_graph_manager\n",
|
||||
" if self.ap.is_a_highest_level_agent:\n",
|
||||
" graph_manager.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate\n",
|
||||
"\n",
|
||||
" if self.phase == RunPhase.TRAIN:\n",
|
||||
" if graph_manager.should_test_current_sub_goal:\n",
|
||||
" self.exploration_policy.change_phase(RunPhase.TEST)\n",
|
||||
" else:\n",
|
||||
" self.exploration_policy.change_phase(self.phase)\n",
|
||||
"\n",
|
||||
" action_info = super().choose_action(curr_state)\n",
|
||||
" return action_info\n",
|
||||
"\n",
|
||||
" def update_transition_before_adding_to_replay_buffer(self, transition):\n",
|
||||
" graph_manager = self.parent_level_manager.parent_graph_manager\n",
|
||||
"\n",
|
||||
" # deal with goals given from a higher level agent\n",
|
||||
" if not self.ap.is_a_highest_level_agent:\n",
|
||||
" transition.state['desired_goal'] = self.current_hrl_goal\n",
|
||||
" transition.next_state['desired_goal'] = self.current_hrl_goal\n",
|
||||
" self.distance_from_goal.add_sample(self.spaces.goal.distance_from_goal(\n",
|
||||
" self.current_hrl_goal, transition.next_state))\n",
|
||||
" goal_reward, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(\n",
|
||||
" self.current_hrl_goal, transition.next_state)\n",
|
||||
" transition.reward = goal_reward\n",
|
||||
" transition.game_over = transition.game_over or sub_goal_reached\n",
|
||||
"\n",
|
||||
" # each level tests its own generated sub goals\n",
|
||||
" if not self.ap.is_a_lowest_level_agent and graph_manager.should_test_current_sub_goal:\n",
|
||||
" _, sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(\n",
|
||||
" transition.action, transition.next_state)\n",
|
||||
"\n",
|
||||
" sub_goal_is_missed = not sub_goal_reached\n",
|
||||
"\n",
|
||||
" if sub_goal_is_missed:\n",
|
||||
" transition.reward = -self.ap.algorithm.time_limit\n",
|
||||
" return transition\n",
|
||||
"\n",
|
||||
" def set_environment_parameters(self, spaces: SpacesDefinition):\n",
|
||||
" super().set_environment_parameters(spaces)\n",
|
||||
"\n",
|
||||
" if self.ap.is_a_highest_level_agent:\n",
|
||||
" # the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will have\n",
|
||||
" # their GoalsSpace set to the in_action_space in agent.set_environment_parameters()\n",
|
||||
" self.spaces.goal = self.spaces.action\n",
|
||||
" self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])\n",
|
||||
"\n",
|
||||
" if not self.ap.is_a_highest_level_agent:\n",
|
||||
" self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# The Preset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Defining the top agent in the hierarchy. Note that the agent's base parameters are the same as the DDPG agent's parameters. We also define here the memory, exploration policy and network topology."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.architectures.tensorflow_components.architecture import Dense\n",
|
||||
"from rl_coach.base_parameters import VisualizationParameters, EmbeddingMergerType, EmbedderScheme, InputEmbedderParameters\n",
|
||||
"from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \\\n",
|
||||
" EpisodicHindsightExperienceReplayParameters\n",
|
||||
"from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \\\n",
|
||||
" EpisodicHRLHindsightExperienceReplayParameters\n",
|
||||
"from rl_coach.memories.memory import MemoryGranularity\n",
|
||||
"from rl_coach.spaces import GoalsSpace, ReachingGoal\n",
|
||||
"from rl_coach.exploration_policies.ou_process import OUProcessParameters\n",
|
||||
"from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"time_limit = 1000\n",
|
||||
"polar_coordinates = False\n",
|
||||
"distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])\n",
|
||||
"goals_space = GoalsSpace('achieved_goal',\n",
|
||||
" ReachingGoal(default_reward=-1, goal_reaching_reward=0,\n",
|
||||
" distance_from_goal_threshold=distance_from_goal_threshold),\n",
|
||||
" lambda goal, state: np.abs(goal - state)) # raw L1 distance\n",
|
||||
"\n",
|
||||
"top_agent_params = HACDDPGAgentParameters()\n",
|
||||
"\n",
|
||||
"# memory - Hindsight Experience Replay\n",
|
||||
"top_agent_params.memory = EpisodicHRLHindsightExperienceReplayParameters()\n",
|
||||
"top_agent_params.memory.max_size = (MemoryGranularity.Transitions, 10000000)\n",
|
||||
"top_agent_params.memory.hindsight_transitions_per_regular_transition = 3\n",
|
||||
"top_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future\n",
|
||||
"top_agent_params.memory.goals_space = goals_space\n",
|
||||
"top_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)\n",
|
||||
"top_agent_params.algorithm.num_consecutive_training_steps = 40\n",
|
||||
"top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)\n",
|
||||
"\n",
|
||||
"# exploration - OU process\n",
|
||||
"top_agent_params.exploration = OUProcessParameters()\n",
|
||||
"top_agent_params.exploration.theta = 0.1\n",
|
||||
"\n",
|
||||
"# actor - note that the default middleware is overriden with 3 dense layers\n",
|
||||
"top_actor = top_agent_params.network_wrappers['actor']\n",
|
||||
"top_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}\n",
|
||||
"top_actor.middleware_parameters.scheme = [Dense([64])] * 3\n",
|
||||
"top_actor.learning_rate = 0.001\n",
|
||||
"top_actor.batch_size = 4096\n",
|
||||
"\n",
|
||||
"# critic - note that the default middleware is overriden with 3 dense layers\n",
|
||||
"top_critic = top_agent_params.network_wrappers['critic']\n",
|
||||
"top_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}\n",
|
||||
"top_critic.embedding_merger_type = EmbeddingMergerType.Concat\n",
|
||||
"top_critic.middleware_parameters.scheme = [Dense([64])] * 3\n",
|
||||
"top_critic.learning_rate = 0.001\n",
|
||||
"top_critic.batch_size = 4096"
|
||||
]
|
||||
},
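  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A short note on the ```goals_space``` defined above, based only on the arguments passed to it: the distance between a goal and a state is the element-wise L1 distance $|goal - state|$ (the lambda above), a goal counts as reached when this distance is within ```distance_from_goal_threshold```, and the ```ReachingGoal``` reward is sparse - ```goal_reaching_reward``` (0) when the goal is reached and ```default_reward``` (-1) otherwise."
   ]
  },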
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The bottom agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.schedules import ConstantSchedule\n",
|
||||
"from rl_coach.exploration_policies.e_greedy import EGreedyParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"bottom_agent_params = HACDDPGAgentParameters()\n",
|
||||
"bottom_agent_params.algorithm.in_action_space = goals_space\n",
|
||||
"\n",
|
||||
"bottom_agent_params.memory = EpisodicHindsightExperienceReplayParameters()\n",
|
||||
"bottom_agent_params.memory.max_size = (MemoryGranularity.Transitions, 12000000)\n",
|
||||
"bottom_agent_params.memory.hindsight_transitions_per_regular_transition = 4\n",
|
||||
"bottom_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future\n",
|
||||
"bottom_agent_params.memory.goals_space = goals_space\n",
|
||||
"bottom_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16 * 25) # 25 episodes is one true env episode\n",
|
||||
"bottom_agent_params.algorithm.num_consecutive_training_steps = 40\n",
|
||||
"bottom_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)\n",
|
||||
"\n",
|
||||
"bottom_agent_params.exploration = EGreedyParameters()\n",
|
||||
"bottom_agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)\n",
|
||||
"bottom_agent_params.exploration.evaluation_epsilon = 0\n",
|
||||
"bottom_agent_params.exploration.continuous_exploration_policy_parameters = OUProcessParameters()\n",
|
||||
"bottom_agent_params.exploration.continuous_exploration_policy_parameters.theta = 0.1\n",
|
||||
"\n",
|
||||
"# actor\n",
|
||||
"bottom_actor = bottom_agent_params.network_wrappers['actor']\n",
|
||||
"bottom_actor.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}\n",
|
||||
"bottom_actor.middleware_parameters.scheme = [Dense([64])] * 3\n",
|
||||
"bottom_actor.learning_rate = 0.001\n",
|
||||
"bottom_actor.batch_size = 4096\n",
|
||||
"\n",
|
||||
"# critic\n",
|
||||
"bottom_critic = bottom_agent_params.network_wrappers['critic']\n",
|
||||
"bottom_critic.input_embedders_parameters = {'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),\n",
|
||||
" 'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}\n",
|
||||
"bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat\n",
|
||||
"bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3\n",
|
||||
"bottom_critic.learning_rate = 0.001\n",
|
||||
"bottom_critic.batch_size = 4096"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we define the parameters of all the agents in the hierarchy from top to bottom"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"agents_params = [top_agent_params, bottom_agent_params]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Define the environment, visualization and schedule parameters. The schedule parameters refer to the top level agent."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from rl_coach.environments.gym_environment import Mujoco\n",
|
||||
"from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod\n",
|
||||
"from rl_coach.graph_managers.hrl_graph_manager import HRLGraphManager\n",
|
||||
"from rl_coach.graph_managers.graph_manager import ScheduleParameters\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"env_params = Mujoco()\n",
|
||||
"env_params.level = \"rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals\"\n",
|
||||
"env_params.additional_simulator_parameters = {\"time_limit\": time_limit,\n",
|
||||
" \"random_goals_instead_of_standing_goal\": False,\n",
|
||||
" \"polar_coordinates\": polar_coordinates,\n",
|
||||
" \"goal_reaching_thresholds\": distance_from_goal_threshold}\n",
|
||||
"env_params.frame_skip = 10\n",
|
||||
"env_params.custom_reward_threshold = -time_limit + 1\n",
|
||||
"\n",
|
||||
"vis_params = VisualizationParameters()\n",
|
||||
"vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]\n",
|
||||
"vis_params.dump_mp4 = False\n",
|
||||
"vis_params.native_rendering = False\n",
|
||||
"\n",
|
||||
"schedule_params = ScheduleParameters()\n",
|
||||
"schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64) # 40 epochs\n",
|
||||
"schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(4 * 64) # 4 small batches of 64 episodes\n",
|
||||
"schedule_params.evaluation_steps = EnvironmentEpisodes(64)\n",
|
||||
"schedule_params.heatup_steps = EnvironmentSteps(0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Lastly, we create a ```HRLGraphManager``` that will execute the hierarchical agent we defined according to the parameters. \n",
|
||||
"\n",
|
||||
"Note that the bottom level agent will run 40 steps on each single step of the top level agent."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"graph_manager = HRLGraphManager(agents_params=agents_params, env_params=env_params,\n",
|
||||
" schedule_params=schedule_params, vis_params=vis_params,\n",
|
||||
" consecutive_steps_to_run_each_level=EnvironmentSteps(40))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Running the Preset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from base_parameters import TaskParameters, Frameworks\n",
|
||||
"\n",
|
||||
"log_path = '../experiments/pendulum_hac'\n",
|
||||
"if not os.path.exists(log_path):\n",
|
||||
" os.makedirs(log_path)\n",
|
||||
" \n",
|
||||
"task_parameters = TaskParameters(framework_type=\"tensorflow\", \n",
|
||||
" evaluate_only=False,\n",
|
||||
" experiment_path=log_path)\n",
|
||||
"\n",
|
||||
"task_parameters.__dict__['save_checkpoint_secs'] = None\n",
|
||||
"task_parameters.__dict__['verbosity'] = 'low'\n",
|
||||
"\n",
|
||||
"graph_manager.create_graph(task_parameters)\n",
|
||||
"\n",
|
||||
"graph_manager.improve()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||