Source code for alf.algorithms.ppo_loss

# Copyright (c) 2019 Horizon Robotics. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loss for PPO algorithm."""

import torch

import alf

from alf.algorithms.actor_critic_loss import ActorCriticLoss
from alf.utils.losses import element_wise_squared_loss
from alf.utils import value_ops


[docs]@alf.configurable
class PPOLoss(ActorCriticLoss):
    """PPO loss."""

    def __init__(self,
                 gamma=0.99,
                 td_error_loss_fn=element_wise_squared_loss,
                 td_lambda=0.95,
                 normalize_advantages=True,
                 compute_advantages_internally=False,
                 advantage_clip=None,
                 entropy_regularization=None,
                 td_loss_weight=1.0,
                 importance_ratio_clipping=0.2,
                 log_prob_clipping=0.0,
                 check_numerics=False,
                 debug_summaries=False,
                 name='PPOLoss'):
        """Implement the simplified surrogate loss in equation (9) of `Proximal
        Policy Optimization Algorithms <https://arxiv.org/abs/1707.06347>`_.

        The total loss equals to

        .. code-block:: python

            (policy_gradient_loss      # (L^{CLIP} in equation (9))
            + td_loss_weight * td_loss # (L^{VF} in equation (9))
            - entropy_regularization * entropy)

        This loss works with ``PPOAlgorithm``. The advantages and returns are
        pre-computed by ``PPOAlgorithm.preprocess()``. One known difference with
        `baselines.ppo2` is that value estimation is not clipped here, while
        `baselines.ppo2` also clipped value if it deviates from returns too
        much.

        Args:
            gamma (float|list[float]): A discount factor for future rewards. For
                multi-dim reward, this can also be a list of discounts, each
                discount applies to a reward dim.
            td_errors_loss_fn (Callable): A function for computing the TD errors
                loss. This function takes as input the target and the estimated
                Q values and returns the loss for each element of the batch.
            td_lambda (float): Lambda parameter for TD-lambda computation.
            normalize_advantages (bool): If True, normalize advantage to zero
                mean and unit variance within batch for caculating policy
                gradient.
            compute_advantages_internally (bool): Normally PPOLoss does not
                compute the adavantage and it expects the info to carry the
                already-computed advantage. If this flag is set to True, PPOLoss
                will instead compute the advantage internally without depending
                on the input info, because loading very large amount of
                experiences into GPU memory to compute advantages may not always
                be possible.
            advantage_clip (float): If set, clip advantages to :math:`[-x, x]`
            entropy_regularization (float): Coefficient for entropy
                regularization loss term.
            td_loss_weight (float): the weigt for the loss of td error.
            importance_ratio_clipping (float):  Epsilon in clipped, surrogate
                PPO objective. See the cited paper for more detail.
            log_prob_clipping (float): If >0, clipping log probs to the range
                ``(-log_prob_clipping, log_prob_clipping)`` to prevent ``inf/NaN``
                values.
            check_numerics (bool):  If true, checking for ``NaN/Inf`` values. For
                debugging only.
            name (str):

        """

        super(PPOLoss, self).__init__(
            gamma=gamma,
            td_error_loss_fn=td_error_loss_fn,
            use_gae=True,
            td_lambda=td_lambda,
            use_td_lambda_return=True,
            normalize_advantages=normalize_advantages,
            advantage_clip=advantage_clip,
            entropy_regularization=entropy_regularization,
            td_loss_weight=td_loss_weight,
            debug_summaries=debug_summaries,
            name=name)

        self._importance_ratio_clipping = importance_ratio_clipping
        self._log_prob_clipping = log_prob_clipping
        self._check_numerics = check_numerics
        self._compute_advantages_internally = compute_advantages_internally

    def _pg_loss(self, info, advantages):
        scope = alf.summary.scope(self._name)
        importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
            action_distribution=info.action_distribution,
            rollout_action_distribution=info.rollout_action_distribution,
            action=info.action,
            rollout_log_prob=info.rollout_log_prob,
            clipping_mode='double_sided',
            scope=scope,
            importance_ratio_clipping=self._importance_ratio_clipping,
            log_prob_clipping=self._log_prob_clipping,
            check_numerics=self._check_numerics,
            debug_summaries=self._debug_summaries)
        # Pessimistically choose the maximum objective value for clipped and
        # unclipped importance ratios.
        pg_objective = -importance_ratio * advantages
        pg_objective_clipped = -importance_ratio_clipped * advantages
        policy_gradient_loss = torch.max(pg_objective, pg_objective_clipped)

        if self._debug_summaries and alf.summary.should_record_summaries():
            with scope:
                alf.summary.histogram('pg_objective', pg_objective)
                alf.summary.histogram('pg_objective_clipped',
                                      pg_objective_clipped)

        if self._check_numerics:
            assert torch.all(torch.isfinite(policy_gradient_loss))

        return policy_gradient_loss

    def _calc_returns_and_advantages(self, info, value):
        if not self._compute_advantages_internally:
            return info.returns, info.advantages
        # If rollout_value is present in ``info``, we use it to compute the
        # advantage. This is mainly for algorithms like PPG where at this time
        # info.value is the newly computed value which is different from the
        # rollout time value prediction. The rollout time value prediction is
        # preserved in info.rollout_value.
        assert hasattr(info, 'rollout_value'), (
            'Expect info.rollout_value to exist for computing returns '
            'and advatages inside PPOLoss.')
        return super()._calc_returns_and_advantages(info, info.rollout_value)