#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import gym
import numpy as np
from gym import spaces
from gym.envs.registration import EnvSpec
from mujoco_py import load_model_from_path, MjSim, MjViewer, MjRenderContextOffscreen


class PendulumWithGoals(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30
    }
    def __init__(self, goal_reaching_thresholds=np.array([0.075, 0.075, 0.75]),
                 goal_not_reached_penalty=-1, goal_reached_reward=0, terminate_on_goal_reaching=True,
                 time_limit=1000, frameskip=1, random_goals_instead_of_standing_goal=False,
                 polar_coordinates: bool=False):
        super().__init__()

        dir = os.path.dirname(__file__)
        model = load_model_from_path(dir + "/pendulum_with_goals.xml")

        self.sim = MjSim(model)
        self.viewer = None
        self.rgb_viewer = None

        self.frameskip = frameskip
        self.goal = None
        self.goal_reaching_thresholds = goal_reaching_thresholds
        self.goal_not_reached_penalty = goal_not_reached_penalty
        self.goal_reached_reward = goal_reached_reward
        self.terminate_on_goal_reaching = terminate_on_goal_reaching
        self.time_limit = time_limit
        self.current_episode_steps_counter = 0
        self.random_goals_instead_of_standing_goal = random_goals_instead_of_standing_goal
        self.polar_coordinates = polar_coordinates

        # spaces definition
        self.action_space = spaces.Box(low=-self.sim.model.actuator_ctrlrange[:, 1],
                                       high=self.sim.model.actuator_ctrlrange[:, 1],
                                       dtype=np.float32)

        if self.polar_coordinates:
            self.observation_space = spaces.Dict({
                "observation": spaces.Box(low=np.array([-np.pi, -15]),
                                          high=np.array([np.pi, 15]),
                                          dtype=np.float32),
                "desired_goal": spaces.Box(low=np.array([-np.pi, -15]),
                                           high=np.array([np.pi, 15]),
                                           dtype=np.float32),
                "achieved_goal": spaces.Box(low=np.array([-np.pi, -15]),
                                            high=np.array([np.pi, 15]),
                                            dtype=np.float32)
            })
        else:
            self.observation_space = spaces.Dict({
                "observation": spaces.Box(low=np.array([-1, -1, -15]),
                                          high=np.array([1, 1, 15]),
                                          dtype=np.float32),
                "desired_goal": spaces.Box(low=np.array([-1, -1, -15]),
                                           high=np.array([1, 1, 15]),
                                           dtype=np.float32),
                "achieved_goal": spaces.Box(low=np.array([-1, -1, -15]),
                                            high=np.array([1, 1, 15]),
                                            dtype=np.float32)
            })

        self.spec = EnvSpec('PendulumWithGoals-v0')
        self.spec.reward_threshold = self.goal_not_reached_penalty * self.time_limit

        self.reset()
    def _goal_reached(self):
        observation = self._get_obs()
        if np.any(np.abs(observation['achieved_goal'] - observation['desired_goal']) > self.goal_reaching_thresholds):
            return False
        else:
            return True
    def _terminate(self):
        if (self._goal_reached() and self.terminate_on_goal_reaching) or \
                self.current_episode_steps_counter >= self.time_limit:
            return True
        else:
            return False
    def _reward(self):
        if self._goal_reached():
            return self.goal_reached_reward
        else:
            return self.goal_not_reached_penalty
    def step(self, action):
        self.sim.data.ctrl[:] = action
        for _ in range(self.frameskip):
            self.sim.step()
        self.current_episode_steps_counter += 1

        state = self._get_obs()

        # visualize the angular velocities
        state_velocity = np.copy(state['observation'][-1] / 20)
        goal_velocity = self.goal[-1] / 20
        self.sim.model.site_size[2] = np.array([0.01, 0.01, state_velocity])
        self.sim.data.mocap_pos[2] = np.array([0.85, 0, 0.75 + state_velocity])
        self.sim.model.site_size[3] = np.array([0.01, 0.01, goal_velocity])
        self.sim.data.mocap_pos[3] = np.array([1.15, 0, 0.75 + goal_velocity])

        return state, self._reward(), self._terminate(), {}
    def _get_obs(self):
        """
        y
        ^
        |____
        |   /
        |  /
        |~/
        |/
        --------> x
        """
        # observation
        angle = self.sim.data.qpos
        angular_velocity = self.sim.data.qvel
        if self.polar_coordinates:
            observation = np.concatenate([angle - np.pi, angular_velocity])
        else:
            x = np.sin(angle)
            y = np.cos(angle)  # qpos is the angle relative to a standing pole
            observation = np.concatenate([x, y, angular_velocity])

        return {
            "observation": observation,
            "desired_goal": self.goal,
            "achieved_goal": observation
        }
    def reset(self):
        self.current_episode_steps_counter = 0

        # set initial state
        angle = np.random.uniform(np.pi / 4, 7 * np.pi / 4)
        angular_velocity = np.random.uniform(-0.05, 0.05)
        self.sim.data.qpos[0] = angle
        self.sim.data.qvel[0] = angular_velocity
        self.sim.step()

        # goal
        if self.random_goals_instead_of_standing_goal:
            angle_target = np.random.uniform(-np.pi / 8, np.pi / 8)
            angular_velocity_target = np.random.uniform(-0.2, 0.2)
        else:
            angle_target = 0
            angular_velocity_target = 0

        # convert target values to goal
        x_target = np.sin(angle_target)
        y_target = np.cos(angle_target)
        if self.polar_coordinates:
            self.goal = np.array([angle_target - np.pi, angular_velocity_target])
        else:
            self.goal = np.array([x_target, y_target, angular_velocity_target])

        # visualize the goal
        self.sim.data.mocap_pos[0] = [x_target, 0, y_target]

        return self._get_obs()
    def render(self, mode='human', close=False):
        if mode == 'human':
            if self.viewer is None:
                self.viewer = MjViewer(self.sim)
            self.viewer.render()
        elif mode == 'rgb_array':
            if self.rgb_viewer is None:
                self.rgb_viewer = MjRenderContextOffscreen(self.sim, 0)
            self.rgb_viewer.render(500, 500)
            # window size used for old mujoco-py:
            data = self.rgb_viewer.read_pixels(500, 500, depth=False)
            # original image is upside-down, so flip it
            return data[::-1, :, :]
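

# A minimal usage sketch, assuming mujoco_py is installed with working MuJoCo binaries
# and that pendulum_with_goals.xml sits next to this file. The episode length, frameskip,
# and random-action loop below are illustrative choices, not part of the class API.
if __name__ == '__main__':
    env = PendulumWithGoals(time_limit=200, frameskip=2)
    obs = env.reset()
    for _ in range(200):
        # sample a random torque within the actuator control range
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        env.render(mode='human')
        if done:
            obs = env.reset()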