"""Core API for Environment, Wrapper, ActionWrapper, RewardWrapper and ObservationWrapper.""" | |
import sys | |
from typing import ( | |
TYPE_CHECKING, | |
Any, | |
Dict, | |
Generic, | |
List, | |
Optional, | |
SupportsFloat, | |
Tuple, | |
TypeVar, | |
Union, | |
) | |
import numpy as np | |
from gym import spaces | |
from gym.logger import warn | |
from gym.utils import seeding | |
if TYPE_CHECKING: | |
from gym.envs.registration import EnvSpec | |
if sys.version_info[0:2] == (3, 6): | |
warn( | |
"Gym minimally supports python 3.6 as the python foundation not longer supports the version, please update your version to 3.7+" | |
) | |
ObsType = TypeVar("ObsType") | |
ActType = TypeVar("ActType") | |
RenderFrame = TypeVar("RenderFrame") | |
class Env(Generic[ObsType, ActType]):
    r"""The main OpenAI Gym class.

    It encapsulates an environment with arbitrary behind-the-scenes dynamics.
    An environment can be partially or fully observed.

    The main API methods that users of this class need to know are:

    - :meth:`step` - Takes a step in the environment using an action, returning the next observation, the reward,
      whether the episode has terminated or truncated, and auxiliary information.
    - :meth:`reset` - Resets the environment to an initial state, returning the initial observation and auxiliary information.
    - :meth:`render` - Renders the environment observation; the available modes depend on :attr:`render_mode`.
    - :meth:`close` - Closes the environment, important for rendering where pygame is imported.

    Environments should also set the following attributes:

    - :attr:`action_space` - The Space object corresponding to valid actions
    - :attr:`observation_space` - The Space object corresponding to valid observations
    - :attr:`reward_range` - A tuple corresponding to the minimum and maximum possible rewards
    - :attr:`spec` - An environment spec that contains the information used to initialise the environment from `gym.make`
    - :attr:`metadata` - The metadata of the environment, e.g. render modes
    - :attr:`np_random` - The random number generator for the environment

    Note: a default reward range set to :math:`(-\infty,+\infty)` already exists. Set it if you want a narrower range.
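
    A minimal sketch of the agent-environment loop (``"CartPole-v1"`` is only an assumed example of a
    registered environment id; any registered id works). This is the minimal example that the
    :meth:`reset` documentation refers back to::

        import gym

        env = gym.make("CartPole-v1")
        observation, info = env.reset(seed=42)  # seed once, right after creation
        for _ in range(1000):
            action = env.action_space.sample()  # a random policy stands in for an agent
            observation, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                observation, info = env.reset()
        env.close()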
""" | |
# Set this in SOME subclasses | |
metadata: Dict[str, Any] = {"render_modes": []} | |
# define render_mode if your environment supports rendering | |
render_mode: Optional[str] = None | |
reward_range = (-float("inf"), float("inf")) | |
spec: "EnvSpec" = None | |
# Set these in ALL subclasses | |
action_space: spaces.Space[ActType] | |
observation_space: spaces.Space[ObsType] | |
# Created | |
_np_random: Optional[np.random.Generator] = None | |
def np_random(self) -> np.random.Generator: | |
"""Returns the environment's internal :attr:`_np_random` that if not set will initialise with a random seed.""" | |
if self._np_random is None: | |
self._np_random, seed = seeding.np_random() | |
return self._np_random | |
def np_random(self, value: np.random.Generator): | |
self._np_random = value | |
    def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
        """Run one timestep of the environment's dynamics.

        When the end of an episode is reached, you are responsible for calling :meth:`reset` to reset this environment's state.
        Accepts an action and returns a tuple ``(observation, reward, terminated, truncated, info)``.

        Args:
            action (ActType): an action provided by the agent

        Returns:
            observation (object): this will be an element of the environment's :attr:`observation_space`.
                This may, for instance, be a numpy array containing the positions and velocities of certain objects.
            reward (float): The amount of reward returned as a result of taking the action.
            terminated (bool): whether a `terminal state` (as defined under the MDP of the task) is reached.
                In this case further step() calls could return undefined results.
            truncated (bool): whether a truncation condition outside the scope of the MDP is satisfied.
                Typically a timelimit, but could also be used to indicate an agent physically going out of bounds.
                Can be used to end the episode prematurely before a `terminal state` is reached.
            info (dictionary): `info` contains auxiliary diagnostic information (helpful for debugging, learning, and logging).
                This might, for instance, contain: metrics that describe the agent's performance state, variables that are
                hidden from observations, or individual reward terms that are combined to produce the total reward.
                It also can contain information that distinguishes truncation and termination, however this is deprecated in favour
                of returning two booleans, and will be removed in a future version.

            (deprecated)
            done (bool): A boolean value for if the episode has ended, in which case further :meth:`step` calls will return undefined results.
                A done signal may be emitted for different reasons: Maybe the task underlying the environment was solved successfully,
                a certain timelimit was exceeded, or the physics simulation has entered an invalid state.
        """
        raise NotImplementedError
    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[dict] = None,
    ) -> Tuple[ObsType, dict]:
        """Resets the environment to an initial state and returns the initial observation.

        This method can reset the environment's random number generator(s) if ``seed`` is an integer or
        if the environment has not yet initialized a random number generator.
        If the environment already has a random number generator and :meth:`reset` is called with ``seed=None``,
        the RNG should not be reset. Moreover, :meth:`reset` should (in the typical use case) be called with an
        integer seed right after initialization and then never again.

        Args:
            seed (optional int): The seed that is used to initialize the environment's PRNG.
                If the environment does not already have a PRNG and ``seed=None`` (the default option) is passed,
                a seed will be chosen from some source of entropy (e.g. timestamp or /dev/urandom).
                However, if the environment already has a PRNG and ``seed=None`` is passed, the PRNG will *not* be reset.
                If you pass an integer, the PRNG will be reset even if it already exists.
                Usually, you want to pass an integer *right after the environment has been initialized and then never again*.
                Please refer to the minimal example above to see this paradigm in action.
            options (optional dict): Additional information to specify how the environment is reset (optional,
                depending on the specific environment)

        Returns:
            observation (object): Observation of the initial state. This will be an element of :attr:`observation_space`
                (typically a numpy array) and is analogous to the observation returned by :meth:`step`.
            info (dictionary): This dictionary contains auxiliary information complementing ``observation``. It should be analogous to
                the ``info`` returned by :meth:`step`.
        """
        # Initialize the RNG if the seed is manually passed
        if seed is not None:
            self._np_random, seed = seeding.np_random(seed)
    def render(self) -> Optional[Union[RenderFrame, List[RenderFrame]]]:
        """Compute the render frames as specified by the :attr:`render_mode` attribute set during initialization of the environment.

        The set of supported modes varies per environment. (And some
        third-party environments may not support rendering at all.)
        By convention, if render_mode is:

        - None (default): no render is computed.
        - human: render returns None.
          The environment is continuously rendered in the current display or terminal. Usually for human consumption.
        - rgb_array: return a single frame representing the current state of the environment.
          A frame is a numpy.ndarray with shape (x, y, 3) representing RGB values for an x-by-y pixel image.
        - rgb_array_list: return a list of frames representing the states of the environment since the last reset.
          Each frame is a numpy.ndarray with shape (x, y, 3), as with `rgb_array`.
        - ansi: Return a string (str) or StringIO.StringIO containing a
          terminal-style text representation for each time step.
          The text can include newlines and ANSI escape sequences (e.g. for colors).

        Note:
            Make sure that your class's metadata 'render_modes' key includes
            the list of supported modes. It's recommended to call super()
            in implementations to use the functionality of this method.
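
        An illustrative sketch (``"CartPole-v1"`` is only an assumed example of an environment id
        that supports the ``rgb_array`` mode)::

            import gym

            env = gym.make("CartPole-v1", render_mode="rgb_array")
            env.reset()
            frame = env.render()  # numpy.ndarray with shape (x, y, 3)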
""" | |
raise NotImplementedError | |
def close(self): | |
"""Override close in your subclass to perform any necessary cleanup. | |
Environments will automatically :meth:`close()` themselves when | |
garbage collected or when the program exits. | |
""" | |
pass | |
    @property
    def unwrapped(self) -> "Env":
        """Returns the base non-wrapped environment.

        Returns:
            Env: The base non-wrapped gym.Env instance
        """
        return self

    def __str__(self):
        """Returns a string of the environment with the spec id if specified."""
        if self.spec is None:
            return f"<{type(self).__name__} instance>"
        else:
            return f"<{type(self).__name__}<{self.spec.id}>>"

    def __enter__(self):
        """Support with-statement for the environment."""
        return self

    def __exit__(self, *args):
        """Support with-statement for the environment."""
        self.close()
        # propagate exception
        return False
class Wrapper(Env[ObsType, ActType]):
    """Wraps an environment to allow a modular transformation of the :meth:`step` and :meth:`reset` methods.

    This class is the base class for all wrappers. Subclasses can override
    some methods to change the behavior of the original environment without touching the
    original code.

    Note:
        Don't forget to call ``super().__init__(env)`` if the subclass overrides :meth:`__init__`.
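
    An illustrative sketch (the ``ActionLoggingWrapper`` name and its behaviour are made up for this
    example; they are not part of Gym)::

        class ActionLoggingWrapper(gym.Wrapper):
            def __init__(self, env):
                super().__init__(env)
                self.taken_actions = []

            def step(self, action):
                # record the action, then defer to the wrapped environment
                self.taken_actions.append(action)
                return self.env.step(action)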
""" | |
def __init__(self, env: Env): | |
"""Wraps an environment to allow a modular transformation of the :meth:`step` and :meth:`reset` methods. | |
Args: | |
env: The environment to wrap | |
""" | |
self.env = env | |
self._action_space: Optional[spaces.Space] = None | |
self._observation_space: Optional[spaces.Space] = None | |
self._reward_range: Optional[Tuple[SupportsFloat, SupportsFloat]] = None | |
self._metadata: Optional[dict] = None | |
def __getattr__(self, name): | |
"""Returns an attribute with ``name``, unless ``name`` starts with an underscore.""" | |
if name.startswith("_"): | |
raise AttributeError(f"accessing private attribute '{name}' is prohibited") | |
return getattr(self.env, name) | |
def spec(self): | |
"""Returns the environment specification.""" | |
return self.env.spec | |
def class_name(cls): | |
"""Returns the class name of the wrapper.""" | |
return cls.__name__ | |
    @property
    def action_space(self) -> spaces.Space[ActType]:
        """Returns the action space of the environment."""
        if self._action_space is None:
            return self.env.action_space
        return self._action_space

    @action_space.setter
    def action_space(self, space: spaces.Space):
        self._action_space = space

    @property
    def observation_space(self) -> spaces.Space:
        """Returns the observation space of the environment."""
        if self._observation_space is None:
            return self.env.observation_space
        return self._observation_space

    @observation_space.setter
    def observation_space(self, space: spaces.Space):
        self._observation_space = space

    @property
    def reward_range(self) -> Tuple[SupportsFloat, SupportsFloat]:
        """Return the reward range of the environment."""
        if self._reward_range is None:
            return self.env.reward_range
        return self._reward_range

    @reward_range.setter
    def reward_range(self, value: Tuple[SupportsFloat, SupportsFloat]):
        self._reward_range = value

    @property
    def metadata(self) -> dict:
        """Returns the environment metadata."""
        if self._metadata is None:
            return self.env.metadata
        return self._metadata

    @metadata.setter
    def metadata(self, value):
        self._metadata = value

    @property
    def render_mode(self) -> Optional[str]:
        """Returns the environment render_mode."""
        return self.env.render_mode

    @property
    def np_random(self) -> np.random.Generator:
        """Returns the environment np_random."""
        return self.env.np_random

    @np_random.setter
    def np_random(self, value):
        self.env.np_random = value

    @property
    def _np_random(self):
        raise AttributeError(
            "Can't access `_np_random` of a wrapper, use `.unwrapped._np_random` or `.np_random`."
        )
    def step(self, action: ActType) -> Tuple[ObsType, float, bool, bool, dict]:
        """Steps through the environment with action."""
        return self.env.step(action)

    def reset(self, **kwargs) -> Tuple[ObsType, dict]:
        """Resets the environment with kwargs."""
        return self.env.reset(**kwargs)

    def render(
        self, *args, **kwargs
    ) -> Optional[Union[RenderFrame, List[RenderFrame]]]:
        """Renders the environment."""
        return self.env.render(*args, **kwargs)

    def close(self):
        """Closes the environment."""
        return self.env.close()

    def __str__(self):
        """Returns the wrapper name and the unwrapped environment string."""
        return f"<{type(self).__name__}{self.env}>"

    def __repr__(self):
        """Returns the string representation of the wrapper."""
        return str(self)

    @property
    def unwrapped(self) -> Env:
        """Returns the base environment of the wrapper."""
        return self.env.unwrapped
class ObservationWrapper(Wrapper):
    """Superclass of wrappers that can modify observations using :meth:`observation` for :meth:`reset` and :meth:`step`.

    If you would like to apply a function to the observation that is returned by the base environment before
    passing it to learning code, you can simply inherit from :class:`ObservationWrapper` and overwrite the method
    :meth:`observation` to implement that transformation. The transformation defined in that method must be
    defined on the base environment's observation space. However, it may take values in a different space.
    In that case, you need to specify the new observation space of the wrapper by setting :attr:`self.observation_space`
    in the :meth:`__init__` method of your wrapper.

    For example, you might have a 2D navigation task where the environment returns dictionaries as observations with
    keys ``"agent_position"`` and ``"target_position"``. A common thing to do might be to throw away some degrees of
    freedom and only consider the position of the target relative to the agent, i.e.
    ``observation["target_position"] - observation["agent_position"]``. For this, you could implement an
    observation wrapper like this::

        class RelativePosition(gym.ObservationWrapper):
            def __init__(self, env):
                super().__init__(env)
                self.observation_space = Box(shape=(2,), low=-np.inf, high=np.inf)

            def observation(self, obs):
                return obs["target_position"] - obs["agent_position"]

    Among others, Gym provides the observation wrapper :class:`TimeAwareObservation`, which adds information about the
    index of the timestep to the observation.
    """

    def reset(self, **kwargs):
        """Resets the environment, returning a modified observation using :meth:`self.observation`."""
        obs, info = self.env.reset(**kwargs)
        return self.observation(obs), info

    def step(self, action):
        """Returns a modified observation using :meth:`self.observation` after calling :meth:`env.step`."""
        observation, reward, terminated, truncated, info = self.env.step(action)
        return self.observation(observation), reward, terminated, truncated, info

    def observation(self, observation):
        """Returns a modified observation."""
        raise NotImplementedError
class RewardWrapper(Wrapper):
    """Superclass of wrappers that can modify the returned reward from a step.

    If you would like to apply a function to the reward that is returned by the base environment before
    passing it to learning code, you can simply inherit from :class:`RewardWrapper` and overwrite the method
    :meth:`reward` to implement that transformation.
    This transformation might change the reward range; to specify the reward range of your wrapper,
    you can simply define :attr:`self.reward_range` in :meth:`__init__`.

    Let us look at an example: Sometimes (especially when we do not have control over the reward
    because it is intrinsic), we want to clip the reward to a range to gain some numerical stability.
    To do that, we could, for instance, implement the following wrapper::

        class ClipReward(gym.RewardWrapper):
            def __init__(self, env, min_reward, max_reward):
                super().__init__(env)
                self.min_reward = min_reward
                self.max_reward = max_reward
                self.reward_range = (min_reward, max_reward)

            def reward(self, reward):
                return np.clip(reward, self.min_reward, self.max_reward)
    """

    def step(self, action):
        """Modifies the reward using :meth:`self.reward` after the environment :meth:`env.step`."""
        observation, reward, terminated, truncated, info = self.env.step(action)
        return observation, self.reward(reward), terminated, truncated, info

    def reward(self, reward):
        """Returns a modified ``reward``."""
        raise NotImplementedError
class ActionWrapper(Wrapper):
    """Superclass of wrappers that can modify the action before :meth:`env.step`.

    If you would like to apply a function to the action before passing it to the base environment,
    you can simply inherit from :class:`ActionWrapper` and overwrite the method :meth:`action` to implement
    that transformation. The transformation defined in that method must take values in the base environment's
    action space. However, its domain might differ from the original action space.
    In that case, you need to specify the new action space of the wrapper by setting :attr:`self.action_space` in
    the :meth:`__init__` method of your wrapper.

    Let's say you have an environment with an action space of type :class:`gym.spaces.Box`, but you would only like
    to use a finite subset of actions. Then, you might want to implement the following wrapper::

        class DiscreteActions(gym.ActionWrapper):
            def __init__(self, env, disc_to_cont):
                super().__init__(env)
                self.disc_to_cont = disc_to_cont
                self.action_space = Discrete(len(disc_to_cont))

            def action(self, act):
                return self.disc_to_cont[act]

        if __name__ == "__main__":
            env = gym.make("LunarLanderContinuous-v2")
            wrapped_env = DiscreteActions(env, [np.array([1, 0]), np.array([-1, 0]),
                                                np.array([0, 1]), np.array([0, -1])])
            print(wrapped_env.action_space)  # Discrete(4)

    Among others, Gym provides the action wrappers :class:`ClipAction` and :class:`RescaleAction`.
    """

    def step(self, action):
        """Runs the environment :meth:`env.step` using the modified ``action`` from :meth:`self.action`."""
        return self.env.step(self.action(action))

    def action(self, action):
        """Returns a modified action before :meth:`env.step` is called."""
        raise NotImplementedError

    def reverse_action(self, action):
        """Returns a reversed ``action``."""
        raise NotImplementedError
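

# Illustrative sketch (not part of the Gym API defined in this module): wrappers compose by
# nesting, and `.unwrapped` walks back to the base environment. "CartPole-v1" is only an
# assumed example of a registered environment id.
#
#     import gym
#     from gym.wrappers import RecordEpisodeStatistics, TimeLimit
#
#     env = gym.make("CartPole-v1")
#     env = RecordEpisodeStatistics(TimeLimit(env, max_episode_steps=100))
#     print(env)            # <RecordEpisodeStatistics<TimeLimit<...>>>
#     print(env.unwrapped)  # the base, non-wrapped environment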