
rl

This namespace provides various reinforcement learning utilities.

ActClipWrapperModule (Module)
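
Wraps a torch module and clips the wrapped module's outputs so that each output element stays within the lower and upper bounds of the given Box space.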

Source code in evotorch/neuroevolution/net/rl.py
class ActClipWrapperModule(nn.Module):
    def __init__(self, wrapped_module: nn.Module, obs_space: Box):
        super().__init__()

        device = device_of_module(wrapped_module)

        if not isinstance(obs_space, Box):
            raise TypeError(f"Unrecognized observation space: {obs_space}")

        self.wrapped_module = wrapped_module
        self.register_buffer("_low", torch.from_numpy(obs_space.low).to(device))
        self.register_buffer("_high", torch.from_numpy(obs_space.high).to(device))

    def forward(self, x: torch.Tensor, h: Any = None) -> Union[torch.Tensor, tuple]:
        if h is None:
            result = self.wrapped_module(x)
        else:
            result = self.wrapped_module(x, h)

        if isinstance(result, tuple):
            x, h = result
            got_h = True
        else:
            x = result
            h = None
            got_h = False

        x = torch.max(x, self._low)
        x = torch.min(x, self._high)

        if got_h:
            return x, h
        else:
            return x

forward(self, x, h=None)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note: Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in evotorch/neuroevolution/net/rl.py
def forward(self, x: torch.Tensor, h: Any = None) -> Union[torch.Tensor, tuple]:
    if h is None:
        result = self.wrapped_module(x)
    else:
        result = self.wrapped_module(x, h)

    if isinstance(result, tuple):
        x, h = result
        got_h = True
    else:
        x = result
        h = None
        got_h = False

    x = torch.max(x, self._low)
    x = torch.min(x, self._high)

    if got_h:
        return x, h
    else:
        return x
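
As a minimal usage sketch (the toy policy and the Box bounds below are illustrative assumptions, not part of the library):

import torch
from torch import nn
from gym.spaces import Box  # newer setups may import Box from gymnasium.spaces instead

from evotorch.neuroevolution.net.rl import ActClipWrapperModule

policy = nn.Linear(4, 2)  # a toy policy with unbounded raw outputs

# Despite the parameter name `obs_space` in the constructor, the bounds of
# this Box are applied to the wrapped module's outputs (i.e. the actions).
bounds = Box(low=-1.0, high=1.0, shape=(2,))
clipped_policy = ActClipWrapperModule(policy, bounds)

action = clipped_policy(torch.randn(4))  # each element now lies within [-1.0, 1.0]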

AliveBonusScheduleWrapper (Wrapper)

A Wrapper which awards the agent for being alive in a scheduled manner. This wrapper is meant to be used for non-vectorized environments.

Source code in evotorch/neuroevolution/net/rl.py
class AliveBonusScheduleWrapper(gym.Wrapper):
    """
    A Wrapper which awards the agent for being alive in a scheduled manner.
    This wrapper is meant to be used for non-vectorized environments.
    """

    def __init__(self, env: gym.Env, alive_bonus_schedule: tuple, **kwargs):
        """
        `__init__(...)`: Initialize the AliveBonusScheduleWrapper.

        Args:
            env: Environment to wrap.
            alive_bonus_schedule: If given as a tuple `(t, b)`, an alive
                bonus `b` will be added onto all the rewards beyond the
                timestep `t`.
                If given as a tuple `(t0, t1, b)`, a partial (linearly
                increasing towards `b`) alive bonus will be added onto
                all the rewards between the timesteps `t0` and `t1`,
                and a full alive bonus (which equals to `b`) will be added
                onto all the rewards beyond the timestep `t1`.
            kwargs: Expected in the form of additional keyword arguments,
                these will be passed to the initialization method of the
                superclass.
        """
        super().__init__(env, **kwargs)
        self.__t: Optional[int] = None

        if len(alive_bonus_schedule) == 3:
            self.__t0, self.__t1, self.__bonus = (
                int(alive_bonus_schedule[0]),
                int(alive_bonus_schedule[1]),
                float(alive_bonus_schedule[2]),
            )
        elif len(alive_bonus_schedule) == 2:
            self.__t0, self.__t1, self.__bonus = (
                int(alive_bonus_schedule[0]),
                int(alive_bonus_schedule[0]),
                float(alive_bonus_schedule[1]),
            )
        else:
            raise ValueError(
                f"The argument `alive_bonus_schedule` was expected to have 2 or 3 elements."
                f" However, its value is {repr(alive_bonus_schedule)} (having {len(alive_bonus_schedule)} elements)."
            )

        if self.__t1 > self.__t0:
            self.__gap = self.__t1 - self.__t0
        else:
            self.__gap = None

    def reset(self, *args, **kwargs):
        self.__t = 0
        return self.env.reset(*args, **kwargs)

    def step(self, action) -> tuple:
        step_result = self.env.step(action)
        self.__t += 1

        observation = step_result[0]
        reward = step_result[1]
        rest = step_result[2:]

        if self.__t >= self.__t1:
            reward = reward + self.__bonus
        elif (self.__gap is not None) and (self.__t >= self.__t0):
            reward = reward + ((self.__t - self.__t0) / self.__gap) * self.__bonus

        return (observation, reward) + rest
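
As a usage sketch (the environment id "Hopper-v4" is illustrative only):

import gym

from evotorch.neuroevolution.net.rl import AliveBonusScheduleWrapper

# Schedule (100, 200, 1.0): no bonus before timestep 100, a bonus ramping
# linearly from 0.0 up to 1.0 between timesteps 100 and 200, and the full
# bonus of 1.0 from timestep 200 onwards.
env = AliveBonusScheduleWrapper(gym.make("Hopper-v4"), alive_bonus_schedule=(100, 200, 1.0))

# For example, at timestep 150, halfway through the ramp, the reward is
# increased by ((150 - 100) / (200 - 100)) * 1.0 == 0.5.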

__init__(self, env, alive_bonus_schedule, **kwargs) special

__init__(...): Initialize the AliveBonusScheduleWrapper.

Parameters:

    env (Env, required): Environment to wrap.
    alive_bonus_schedule (tuple, required): If given as a tuple (t, b), an alive bonus b will be added onto all the rewards beyond the timestep t. If given as a tuple (t0, t1, b), a partial (linearly increasing towards b) alive bonus will be added onto all the rewards between the timesteps t0 and t1, and a full alive bonus (which equals b) will be added onto all the rewards beyond the timestep t1.
    kwargs (default: {}): Additional keyword arguments; these will be passed to the initialization method of the superclass.
Source code in evotorch/neuroevolution/net/rl.py
def __init__(self, env: gym.Env, alive_bonus_schedule: tuple, **kwargs):
    """
    `__init__(...)`: Initialize the AliveBonusScheduleWrapper.

    Args:
        env: Environment to wrap.
        alive_bonus_schedule: If given as a tuple `(t, b)`, an alive
            bonus `b` will be added onto all the rewards beyond the
            timestep `t`.
            If given as a tuple `(t0, t1, b)`, a partial (linearly
            increasing towards `b`) alive bonus will be added onto
            all the rewards between the timesteps `t0` and `t1`,
            and a full alive bonus (which equals to `b`) will be added
            onto all the rewards beyond the timestep `t1`.
        kwargs: Expected in the form of additional keyword arguments,
            these will be passed to the initialization method of the
            superclass.
    """
    super().__init__(env, **kwargs)
    self.__t: Optional[int] = None

    if len(alive_bonus_schedule) == 3:
        self.__t0, self.__t1, self.__bonus = (
            int(alive_bonus_schedule[0]),
            int(alive_bonus_schedule[1]),
            float(alive_bonus_schedule[2]),
        )
    elif len(alive_bonus_schedule) == 2:
        self.__t0, self.__t1, self.__bonus = (
            int(alive_bonus_schedule[0]),
            int(alive_bonus_schedule[0]),
            float(alive_bonus_schedule[1]),
        )
    else:
        raise ValueError(
            f"The argument `alive_bonus_schedule` was expected to have 2 or 3 elements."
            f" However, its value is {repr(alive_bonus_schedule)} (having {len(alive_bonus_schedule)} elements)."
        )

    if self.__t1 > self.__t0:
        self.__gap = self.__t1 - self.__t0
    else:
        self.__gap = None

reset(self, *args, **kwargs)

Resets the environment (and the wrapper's internal timestep counter), passing any arguments through to the wrapped environment.

Source code in evotorch/neuroevolution/net/rl.py
def reset(self, *args, **kwargs):
    self.__t = 0
    return self.env.reset(*args, **kwargs)

step(self, action)

Steps through the environment with the given action, adding the scheduled alive bonus onto the reward where applicable.

Source code in evotorch/neuroevolution/net/rl.py
def step(self, action) -> tuple:
    step_result = self.env.step(action)
    self.__t += 1

    observation = step_result[0]
    reward = step_result[1]
    rest = step_result[2:]

    if self.__t >= self.__t1:
        reward = reward + self.__bonus
    elif (self.__gap is not None) and (self.__t >= self.__t0):
        reward = reward + ((self.__t - self.__t0) / self.__gap) * self.__bonus

    return (observation, reward) + rest

ObsNormWrapperModule (Module)
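
Wraps a torch module and normalizes the inputs (observations) given to it, using the statistics collected by a RunningStat or RunningNorm instance, before passing them to the wrapped module.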

Source code in evotorch/neuroevolution/net/rl.py
class ObsNormWrapperModule(nn.Module):
    def __init__(self, wrapped_module: nn.Module, rn: Union[RunningStat, RunningNorm]):
        super().__init__()

        device = device_of_module(wrapped_module)
        self.wrapped_module = wrapped_module

        with torch.no_grad():
            normalizer = deepcopy(rn.to_layer()).to(device)
        self.normalizer = normalizer

    def forward(self, x: torch.Tensor, h: Any = None) -> Union[torch.Tensor, tuple]:
        x = self.normalizer(x)

        if h is None:
            result = self.wrapped_module(x)
        else:
            result = self.wrapped_module(x, h)

        if isinstance(result, tuple):
            x, h = result
            got_h = True
        else:
            x = result
            h = None
            got_h = False

        if got_h:
            return x, h
        else:
            return x

forward(self, x, h=None)

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note: Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.

Source code in evotorch/neuroevolution/net/rl.py
def forward(self, x: torch.Tensor, h: Any = None) -> Union[torch.Tensor, tuple]:
    x = self.normalizer(x)

    if h is None:
        result = self.wrapped_module(x)
    else:
        result = self.wrapped_module(x, h)

    if isinstance(result, tuple):
        x, h = result
        got_h = True
    else:
        x = result
        h = None
        got_h = False

    if got_h:
        return x, h
    else:
        return x
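
A minimal usage sketch follows; the import path of RunningNorm and its constructor arguments are assumptions (check your evotorch version for the exact location and signature):

import torch
from torch import nn

from evotorch.neuroevolution.net.rl import ObsNormWrapperModule
from evotorch.neuroevolution.net import RunningNorm  # import path assumed

policy = nn.Linear(4, 2)

# Accumulate observation statistics (constructor arguments are assumptions).
rn = RunningNorm(shape=(4,), dtype=torch.float32)
rn.update(torch.randn(1000, 4))

# The wrapper copies the collected statistics into a frozen normalizer layer,
# so every observation is normalized before it reaches the wrapped policy.
normed_policy = ObsNormWrapperModule(policy, rn)
action = normed_policy(torch.randn(4))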

reset_env(env)

Reset a gym environment.

For gym 1.0, the plan is to have a reset(...) method which returns a two-element tuple (observation, info), where info is an object providing any additional information regarding the initial state of the agent. However, the old (pre-1.0) gym API (and some environments written with old-gym compatibility in mind) has a reset(...) method which returns a single object: the initial observation. Under the assumption that the observation space of the environment is NOT a tuple, this function works with both pre-1.0 and (hopefully) post-1.0 versions of gym, and always returns the initial observation.

Please do not use this function on environments whose observation spaces are tuples, because then this function cannot distinguish between environments whose reset(...) methods return a tuple and environments whose reset(...) methods return a single observation object that happens to be a tuple.

Parameters:

    env (Env, required): The gym environment which will be reset.

Returns:

    Iterable: The initial observation.

Source code in evotorch/neuroevolution/net/rl.py
def reset_env(env: gym.Env) -> Iterable:
    """
    Reset a gym environment.

    For gym 1.0, the plan is to have a `reset(...)` method which returns
    a two-element tuple `(observation, info)` where `info` is an object
    providing any additional information regarding the initial state of
    the agent. However, the old (pre 1.0) gym API (and some environments
    which were written with old gym compatibility in mind) has (or have)
    a `reset(...)` method which returns a single object that is the
    initial observation.
    With the assumption that the observation space of the environment
    is NOT tuple, this function can work with both pre-1.0 and (hopefully)
    after-1.0 versions of gym, and always returns the initial observation.

    Please do not use this function on environments whose observation
    spaces are tuples, because then this function cannot distinguish between
    environments whose `reset(...)` methods return a tuple and environments
    whose `reset(...)` methods return a single observation object that
    happens to be a tuple.

    Args:
        env: The gym environment which will be reset.
    Returns:
        The initial observation
    """
    result = env.reset()
    if isinstance(result, tuple) and (len(result) == 2):
        result = result[0]
    return result
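
For example (the environment id is illustrative):

import gym

from evotorch.neuroevolution.net.rl import reset_env

env = gym.make("CartPole-v1")

# Works whether `env.reset()` returns `obs` (old API) or `(obs, info)` (new API).
obs = reset_env(env)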

take_step_in_env(env, action)

Take a step in the gym environment. Taking a step means performing the action provided via the arguments.

For gym 1.0, the plan is to have a step(...) method which returns a 5-element tuple containing observation, reward, terminated, truncated, info, where terminated is a boolean indicating whether the episode ended because of the actions taken within the environment, and truncated is a boolean indicating whether the episode ended because the time limit was reached. However, the old (pre-1.0) gym API (and some environments written with old-gym compatibility in mind) has a step(...) method which returns 4 elements: observation, reward, done, info, where done is a boolean indicating whether the episode is "done", either because of termination or because of truncation. This function works with both pre-1.0 and (hopefully) post-1.0 versions of gym, and always returns the 4-element tuple as its result.

Parameters:

    env (Env, required): The gym environment in which the given action will be performed.
    action (Iterable, required): The action to be performed within the environment.

Returns:

    tuple: A tuple in the form (observation, reward, done, info), where observation is the observation received after performing the action, reward is the amount of reward gained, done is a boolean value indicating whether or not the episode has ended, and info is additional information (usually as a dictionary).

Source code in evotorch/neuroevolution/net/rl.py
def take_step_in_env(env: gym.Env, action: Iterable) -> tuple:
    """
    Take a step in the gym environment.
    Taking a step means performing the action provided via the arguments.

    For gym 1.0, the plan is to have a `step(...)` method which returns a
    5-element tuple containing `observation`, `reward`, `terminated`,
    `truncated`, `info` where `terminated` is a boolean indicating whether
    or not the episode is terminated because of the actions taken within the
    environment, and `truncated` is a boolean indicating whether or not the
    episode is finished because the time limit is reached.
    However, the old (pre 1.0) gym API (and some environments which were
    written with old gym compatibility in mind) has (or have) a `step(...)`
    method which returns 4 elements: `observation`, `reward`, `done`, `info`
    where `done` is a boolean indicating whether or not the episode is
    "done", either because of termination or because of truncation.
    This function can work with both pre-1.0 and (hopefully) after-1.0
    versions of gym, and always returns the 4-element tuple as its result.

    Args:
        env: The gym environment in which the given action will be performed.
    Returns:
        A tuple in the form `(observation, reward, done, info)` where
        `observation` is the observation received after performing the action,
        `reward` is the amount of reward gained,
        `done` is a boolean value indicating whether or not the episode has
        ended, and
        `info` is additional information (usually as a dictionary).
    """
    result = env.step(action)
    if isinstance(result, tuple):
        n = len(result)
        if n == 4:
            observation, reward, done, info = result
        elif n == 5:
            observation, reward, terminated, truncated, info = result
            done = terminated or truncated
        else:
            raise ValueError(
                f"The result of the `step(...)` method of the gym environment"
                f" was expected as a tuple of length 4 or 5."
                f" However, the received result is {repr(result)}, which is"
                f" of length {len(result)}."
            )
    else:
        raise TypeError(
            f"The result of the `step(...)` method of the gym environment"
            f" was expected as a tuple of length 4 or 5."
            f" However, the received result is {repr(result)}, which is"
            f" of type {type(result)}."
        )
    return observation, reward, done, info
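
Combined with reset_env, this enables a rollout loop that is agnostic to the gym API version (the environment id below is illustrative):

import gym

from evotorch.neuroevolution.net.rl import reset_env, take_step_in_env

env = gym.make("CartPole-v1")

obs = reset_env(env)
done = False
total_reward = 0.0
while not done:
    action = env.action_space.sample()
    # Always yields the 4-element form, whether `step(...)` returned
    # 4 elements (old API) or 5 elements (new API).
    obs, reward, done, info = take_step_in_env(env, action)
    total_reward += float(reward)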