Source code for coax._core.stochastic_q

from gymnasium.spaces import Box

from ..utils import default_preprocessor
from ..proba_dists import DiscretizedIntervalDist, EmpiricalQuantileDist
from ..value_transforms import ValueTransform
from .base_stochastic_func_type1 import BaseStochasticFuncType1


__all__ = (
    'StochasticQ',
)


[docs]class StochasticQ(BaseStochasticFuncType1):
    r"""

    A q-function :math:`q(s,a)`, represented by a stochastic function
    :math:`\mathbb{P}_\theta(G_t|S_t=s,A_t=a)`.

    Parameters
    ----------
    func : function

        A Haiku-style function that specifies the forward pass.

    env : gymnasium.Env

        The gymnasium-style environment. This is used to validate the input/output structure of
        ``func``.

    value_range : tuple of floats, optional

        A pair of floats :code:`(min_value, max_value)`. If no :code:`value_range` is given,
        :code:`num_bins` is the number of bins of the quantile function as in
        `IQN <https://arxiv.org/abs/1806.06923>`_ or `QR-DQN <https://arxiv.org/abs/1710.10044>`_.

    num_bins : int, optional

        If :code:`value_range` is given: The space of rewards is discretized in :code:`num_bins`
        equal sized bins. We use the default setting of 51 as suggested in the
        `Distributional RL <https://arxiv.org/abs/1707.06887>`_ paper.

        Else: The number of fractions of the quantile function of the rewards is defined by
        :code:`num_bins` as in `IQN <https://arxiv.org/abs/1806.06923>`_ or
        `QR-DQN <https://arxiv.org/abs/1710.10044>`_.

    observation_preprocessor : function, optional

        Turns a single observation into a batch of observations in a form that is convenient for
        feeding into :code:`func`. If left unspecified, this defaults to
        :func:`default_preprocessor(env.observation_space) <coax.utils.default_preprocessor>`.

    action_preprocessor : function, optional

        Turns a single action into a batch of actions in a form that is convenient for feeding into
        :code:`func`. If left unspecified, this defaults
        :func:`default_preprocessor(env.action_space) <coax.utils.default_preprocessor>`.

    value_transform : ValueTransform or pair of funcs, optional

        If provided, the target for the underlying function approximator is transformed:

        .. math::

            \tilde{G}_t\ =\ f(G_t)

        This means that calling the function involves undoing this transformation using its inverse
        :math:`f^{-1}`. The functions :math:`f` and :math:`f^{-1}` are given by
        ``value_transform.transform_func`` and ``value_transform.inverse_func``, respectively. Note
        that a ValueTransform is just a glorified pair of functions, i.e. passing
        ``value_transform=(func, inverse_func)`` works just as well.

    random_seed : int, optional

        Seed for pseudo-random number generators.

    """

    def __init__(
            self, func, env, value_range=None, num_bins=51,
            observation_preprocessor=None, action_preprocessor=None, value_transform=None,
            random_seed=None):

        self.value_transform = value_transform
        proba_dist = self._get_proba_dist(value_transform, num_bins, value_range)

        # set defaults
        if observation_preprocessor is None:
            observation_preprocessor = default_preprocessor(env.observation_space)
        if action_preprocessor is None:
            action_preprocessor = default_preprocessor(env.action_space)
        if self.value_transform is None:
            self.value_transform = ValueTransform(lambda x: x, lambda x: x)
        if not isinstance(self.value_transform, ValueTransform):
            self.value_transform = ValueTransform(*value_transform)

        super().__init__(
            func=func,
            observation_space=env.observation_space,
            action_space=env.action_space,
            observation_preprocessor=observation_preprocessor,
            action_preprocessor=action_preprocessor,
            proba_dist=proba_dist,
            random_seed=random_seed)

    @property
    def num_bins(self):
        return self.proba_dist.space.n

[docs]    @classmethod
    def example_data(
            cls, env, value_range, num_bins=51,
            observation_preprocessor=None, action_preprocessor=None, value_transform=None,
            batch_size=1, random_seed=None):

        value_range = cls._check_value_range(value_range)
        proba_dist = cls._get_proba_dist(value_transform, num_bins, value_range)

        if observation_preprocessor is None:
            observation_preprocessor = default_preprocessor(env.observation_space)
        if action_preprocessor is None:
            action_preprocessor = default_preprocessor(env.action_space)

        return super().example_data(
            env=env,
            observation_preprocessor=observation_preprocessor,
            action_preprocessor=action_preprocessor,
            proba_dist=proba_dist,
            batch_size=batch_size,
            random_seed=random_seed)

[docs]    def __call__(self, s, a=None, return_logp=False):
        r"""

        Sample a value.

        Parameters
        ----------
        s : state observation

            A single state observation :math:`s`.

        a : action, optional

            A single action :math:`a`. This is *required* if the actions space is non-discrete.

        return_logp : bool, optional

            Whether to return the log-propensity associated with the sampled output value.

        Returns
        -------
        value : float or list thereof

            Depending on whether :code:`a` is provided, this either returns a single value or a list
            of :math:`n` values, one for each discrete action.

        logp : non-positive float or list thereof, optional

            The log-propensity associated with the sampled output value. This is only returned if we
            set ``return_logp=True``. Depending on whether :code:`a` is provided, this is either a
            single float or a list of :math:`n` floats, one for each discrete action.

        """
        return super().__call__(s, a=a, return_logp=return_logp)

[docs]    def mean(self, s, a=None):
        r"""

        Get the mean value.

        Parameters
        ----------
        s : state observation

            A single state observation :math:`s`.

        a : action, optional

            A single action :math:`a`. This is *required* if the actions space is non-discrete.

        Returns
        -------
        value : float or list thereof

            Depending on whether :code:`a` is provided, this either returns a single value or a list
            of :math:`n` values, one for each discrete action.

        """
        return super().mean(s, a=a)

[docs]    def mode(self, s, a=None):
        r"""

        Get the most probable value.

        Parameters
        ----------
        s : state observation

            A single state observation :math:`s`.

        a : action, optional

            A single action :math:`a`. This is *required* if the actions space is non-discrete.

        Returns
        -------
        value : float or list thereof

            Depending on whether :code:`a` is provided, this either returns a single value or a list
            of :math:`n` values, one for each discrete action.

        """
        return super().mode(s, a=a)

[docs]    def dist_params(self, s, a=None):
        r"""

        Get the parameters of the underlying (conditional) probability distribution.

        Parameters
        ----------
        s : state observation

            A single state observation :math:`s`.

        a : action, optional

            A single action :math:`a`. This is *required* if the actions space is non-discrete.

        Returns
        -------
        dist_params : dict or list of dicts

            Depending on whether :code:`a` is provided, this either returns a single dist-params
            dict or a list of :math:`n` such dicts, one for each discrete action.

        """
        return super().dist_params(s, a=a)

    @staticmethod
    def _get_proba_dist(value_transform, num_bins, value_range):
        if value_range is not None:
            if value_transform is not None:
                f, _ = value_transform
                value_range = f(value_range[0]), f(value_range[1])
            reward_space = Box(*value_range, shape=())
            return DiscretizedIntervalDist(reward_space, num_bins)
        else:
            return EmpiricalQuantileDist(num_quantiles=num_bins)

    @staticmethod
    def _check_value_range(value_range):
        if not (isinstance(value_range, (tuple, list))
                and len(value_range) == 2
                and isinstance(value_range[0], (int, float))
                and isinstance(value_range[1], (int, float))
                and value_range[0] < value_range[1]):
            raise TypeError("value_range is not a valid pair tuple of floats: (low, high)")
        return float(value_range[0]), float(value_range[1])