# Source code for coax.reward_tracing._transition

from functools import partial

import jax
import jax.numpy as jnp
import numpy as onp

from .._base.mixins import CopyMixin
from ..utils import pretty_repr


# Names exported by ``from ... import *``.
__all__ = ('TransitionBatch',)


class TransitionBatch(CopyMixin):
    r"""

    A container object for a batch of MDP transitions.

    Parameters
    ----------

    S : pytree with ndarray leaves

        A batch of state observations :math:`S_t`.

    A : ndarray

        A batch of actions :math:`A_t`.

    logP : ndarray

        A batch of log-propensities :math:`\log\pi(A_t|S_t)`.

    Rn : ndarray

        A batch of partial (:math:`\gamma`-discounted) returns. For instance,
        in :math:`n`-step bootstrapping these are given by:

        .. math::

            R^{(n)}_t\ &=\ \sum_{k=0}^{n-1}\gamma^kR_{t+k} \\

        In other words, it's the part of the :math:`n`-step return *without*
        the bootstrapping term.

    In : ndarray

        A batch of bootstrap factors. For instance, in :math:`n`-step
        bootstrapping these are given by :math:`I^{(n)}_t=\gamma^n` when
        bootstrapping and :math:`I^{(n)}_t=0` otherwise. Bootstrap factors are
        used in constructing the :math:`n`-step bootstrapped target:

        .. math::

            G^{(n)}_t\ =\ R^{(n)}_t + I^{(n)}_t\,Q(S_{t+n}, A_{t+n})

    S_next : pytree with ndarray leaves

        A batch of next-state observations :math:`S_{t+n}`. This is typically
        used to construct the TD target in :math:`n`-step bootstrapping.

    A_next : ndarray, optional

        A batch of next-actions :math:`A_{t+n}`. This is typically used to
        construct the TD target in :math:`n`-step bootstrapping when using SARSA
        updates.

    logP_next : ndarray, optional

        A batch of log-propensities :math:`\log\pi(A_{t+n}|S_{t+n})`.

    W : ndarray, optional

        A batch of importance weights associated with the sampling procedure that generated each
        transition. For example, we need these values when we sample transitions from a
        :class:`PrioritizedReplayBuffer <coax.experience_replay.PrioritizedReplayBuffer>`.

        If left unspecified, this defaults to ``ones_like(Rn)``.

    idx : ndarray of ints, optional

        A batch of identifiers, one for each transition. If left unspecified, this defaults to
        ``arange(batch_size)`` with dtype ``int32``.

    extra_info : pytree, optional

        Any additional information to carry along with the batch. It is stored as-is. Note that
        :meth:`to_singles` slices every leaf along its first axis, so leaves are expected to have
        a leading batch axis.

    """
    __slots__ = ('S', 'A', 'logP', 'Rn', 'In', 'S_next',
                 'A_next', 'logP_next', 'W', 'idx', 'extra_info')

    def __init__(self, S, A, logP, Rn, In, S_next, A_next=None, logP_next=None, W=None, idx=None,
                 extra_info=None):

        self.S = S
        self.A = A
        self.logP = logP
        self.Rn = Rn
        self.In = In
        self.S_next = S_next
        self.A_next = A_next
        self.logP_next = logP_next
        # uniform importance weights unless the sampler supplied its own
        self.W = onp.ones_like(Rn) if W is None else W
        # read the batch size the same way the `batch_size` property does
        self.idx = onp.arange(onp.shape(Rn)[0], dtype='int32') if idx is None else idx
        self.extra_info = extra_info

    @classmethod
    def from_single(
            cls, s, a, logp, r, done, gamma,
            s_next=None, a_next=None, logp_next=None, w=1, idx=None, extra_info=None):
        r"""

        Create a TransitionBatch (with batch_size=1) from a single transition.

        Parameters
        ----------
        s : state observation

            A single state observation :math:`S_t`.

        a : action

            A single action :math:`A_t`.

        logp : non-positive float

            The log-propensity :math:`\log\pi(A_t|S_t)`.

        r : float or array of floats

            A single reward :math:`R_t`.

        done : bool

            Whether the episode has finished.

        gamma : float between 0 and 1

            The discount factor :math:`\gamma`. The bootstrap factor of the resulting batch is
            ``gamma`` if ``done`` is false and ``0`` otherwise.

        s_next : state observation, optional

            A single next-state observation :math:`S_{t+1}`.

        a_next : action, optional

            A single next-action :math:`A_{t+1}`.

        logp_next : non-positive float, optional

            The log-propensity :math:`\log\pi(A_{t+1}|S_{t+1})`.

        w : positive float, optional

            The importance weight associated with the sampling procedure that generated this
            transition.

        idx : int, optional

            The identifier of this particular transition.

        extra_info : pytree, optional

            Any additional information about this transition. A leading batch axis is added to
            each of its leaves.

        Returns
        -------
        transition_batch : TransitionBatch

            A batch that holds exactly one transition.

        Raises
        ------
        TypeError

            If any of ``logp``, ``r``, ``done``, ``gamma``, ``logp_next`` or ``w`` has an
            unexpected type or lies outside its allowed range.

        """

        # check types
        array = (int, float, onp.ndarray, jnp.ndarray)
        if not (isinstance(logp, array) and onp.all(logp <= 0)):
            raise TypeError(f"logp must be non-positive float(s), got: {logp}")
        if not isinstance(r, array):
            raise TypeError(f"r must be a scalar or an array, got: {r}")
        if not isinstance(done, bool):
            raise TypeError(f"done must be a bool, got: {done}")
        if not (isinstance(gamma, (float, int)) and 0 <= gamma <= 1):
            raise TypeError(f"gamma must be a float in the unit interval [0, 1], got: {gamma}")
        if not (logp_next is None or (isinstance(logp_next, array) and onp.all(logp_next <= 0))):
            raise TypeError(f"logp_next must be None or non-positive float(s), got: {logp_next}")
        if not (isinstance(w, (float, int)) and w > 0):
            raise TypeError(f"w must be a positive float, got: {w}")

        return cls(
            S=_single_to_batch(s),
            A=_single_to_batch(a),
            logP=_single_to_batch(logp),
            Rn=_single_to_batch(r),
            # one-step bootstrap factor: gamma while the episode continues, 0 once it is done
            In=_single_to_batch(float(gamma) * (1. - bool(done))),
            S_next=_single_to_batch(s_next) if s_next is not None else None,
            A_next=_single_to_batch(a_next) if a_next is not None else None,
            logP_next=_single_to_batch(logp_next) if logp_next is not None else None,
            W=_single_to_batch(float(w)),
            idx=_single_to_batch(idx) if idx is not None else None,
            extra_info=_single_to_batch(extra_info) if extra_info is not None else None
        )

    @property
    def batch_size(self):
        r""" The number of transitions in this batch, i.e. the size of the first axis of ``Rn``. """
        return onp.shape(self.Rn)[0]

    def to_singles(self):
        r"""

        Get an iterator of single transitions.

        Returns
        -------
        transition_batches : iterator of TransitionBatch

            An iterator of :class:`TransitionBatch <coax.reward_tracing.TransitionBatch>` objects
            with ``batch_size=1``.

            **Note:** The iterator walks through the individual transitions in batch order, i.e.
            starting at index 0. If the batch already has ``batch_size=1``, the batch itself (not a
            copy) is yielded.

        """
        if self.batch_size == 1:
            yield self
            return  # break out of generator

        def lookup(i, pytree):
            s = slice(i, i + 1)  # ndim-preserving lookup
            # jax.tree_util.tree_map is available in every JAX version, unlike the
            # top-level jax.tree_map alias that has been deprecated and removed
            return jax.tree_util.tree_map(lambda leaf: leaf[s], pytree)

        for i in range(self.batch_size):
            # iterating over self yields the fields in __slots__ order, which matches __init__
            yield TransitionBatch(*map(partial(lookup, i), self))

    def items(self):
        r""" Iterate over ``(field_name, value)`` pairs, in ``__slots__`` order. """
        for k in self.__slots__:
            yield k, getattr(self, k)

    def _asdict(self):
        r""" Return the fields of this batch as a dict that maps field name to value. """
        return dict(self.items())

    def __repr__(self):
        return pretty_repr(self)

    def __iter__(self):
        # yields the field values in __slots__ order (same order as the __init__ signature)
        return (getattr(self, a) for a in self.__slots__)

    def __getitem__(self, int_or_slice):
        # positional access into the tuple of field values
        return tuple(self)[int_or_slice]

    def __eq__(self, other):
        if type(self) is not type(other):
            return False

        # Fields that are None contribute no leaves when a batch is flattened, so two batches
        # with different optional fields can produce leaf lists that are misaligned or of
        # different length. Comparing the tree structures first rules that out.
        structure = jax.tree_util.tree_structure
        if not (structure(self) == structure(other)):
            return False

        array_types = (onp.ndarray, jnp.ndarray)

        def leaves_match(a, b):
            if isinstance(a, array_types) or isinstance(b, array_types):
                # allclose broadcasts, so arrays of different shape must be rejected explicitly
                return onp.shape(a) == onp.shape(b) and bool(onp.allclose(a, b))
            if a is None:
                return a is b
            return a == b

        return all(
            leaves_match(a, b)
            for a, b in zip(jax.tree_util.tree_leaves(self), jax.tree_util.tree_leaves(other)))


def _single_to_batch(pytree):
    """
    Add a leading batch axis of size 1 to every leaf of ``pytree``.

    Parameters
    ----------
    pytree : pytree with scalar or array leaves

        A single (unbatched) value, e.g. one observation, one action or one reward.

    Returns
    -------
    batched : pytree

        A pytree of the same structure whose leaves have shape ``(1, *leaf_shape)``.

    """
    # notice that we're pulling everything out of jax.numpy and into ordinary numpy land;
    # jax.tree_util.tree_map is used because the top-level jax.tree_map alias was deprecated
    # and later removed from JAX
    return jax.tree_util.tree_map(lambda arr: onp.expand_dims(arr, axis=0), pytree)


def _flatten_transition_batch(batch):
    # children are the field values in __slots__ order; there is no auxiliary data
    return tuple(batch), None


def _unflatten_transition_batch(aux_data, children):
    # aux_data is the None produced by the flatten function above and is ignored
    return TransitionBatch(*children)


# Register TransitionBatch as a pytree node so that jax.tree_util functions can traverse it.
jax.tree_util.register_pytree_node(
    TransitionBatch, _flatten_transition_batch, _unflatten_transition_batch)