# Copyright (c) 2019 Horizon Robotics. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loss for PPO algorithm."""
import torch
import alf
from alf.algorithms.actor_critic_loss import ActorCriticLoss
from alf.utils.losses import element_wise_squared_loss
from alf.utils import value_ops
@alf.configurable
class PPOLoss(ActorCriticLoss):
    """PPO loss: clipped-surrogate policy loss on top of ``ActorCriticLoss``."""

    def __init__(self,
                 gamma=0.99,
                 td_error_loss_fn=element_wise_squared_loss,
                 td_lambda=0.95,
                 normalize_advantages=True,
                 compute_advantages_internally=False,
                 advantage_clip=None,
                 entropy_regularization=None,
                 td_loss_weight=1.0,
                 importance_ratio_clipping=0.2,
                 log_prob_clipping=0.0,
                 check_numerics=False,
                 debug_summaries=False,
                 name='PPOLoss'):
        """Implement the simplified surrogate loss in equation (9) of `Proximal
        Policy Optimization Algorithms <https://arxiv.org/abs/1707.06347>`_.

        The total loss equals to

        .. code-block:: python

            (policy_gradient_loss      # (L^{CLIP} in equation (9))
             + td_loss_weight * td_loss  # (L^{VF} in equation (9))
             - entropy_regularization * entropy)

        This loss works with ``PPOAlgorithm``. The advantages and returns are
        pre-computed by ``PPOAlgorithm.preprocess()``. One known difference with
        `baselines.ppo2` is that value estimation is not clipped here, while
        `baselines.ppo2` also clipped value if it deviates from returns too
        much.

        Args:
            gamma (float|list[float]): A discount factor for future rewards. For
                multi-dim reward, this can also be a list of discounts, each
                discount applies to a reward dim.
            td_error_loss_fn (Callable): A function for computing the TD errors
                loss. This function takes as input the target and the estimated
                Q values and returns the loss for each element of the batch.
            td_lambda (float): Lambda parameter for TD-lambda computation.
            normalize_advantages (bool): If True, normalize advantage to zero
                mean and unit variance within batch for calculating policy
                gradient.
            compute_advantages_internally (bool): Normally PPOLoss does not
                compute the advantage and it expects the info to carry the
                already-computed advantage. If this flag is set to True, PPOLoss
                will instead compute the advantage internally without depending
                on the input info, because loading very large amount of
                experiences into GPU memory to compute advantages may not always
                be possible.
            advantage_clip (float): If set, clip advantages to :math:`[-x, x]`
            entropy_regularization (float): Coefficient for entropy
                regularization loss term.
            td_loss_weight (float): the weight for the loss of td error.
            importance_ratio_clipping (float): Epsilon in clipped, surrogate
                PPO objective. See the cited paper for more detail.
            log_prob_clipping (float): If >0, clipping log probs to the range
                ``(-log_prob_clipping, log_prob_clipping)`` to prevent ``inf/NaN``
                values.
            check_numerics (bool): If true, checking for ``NaN/Inf`` values. For
                debugging only.
            debug_summaries (bool): If True, record histograms of the clipped
                and unclipped policy gradient objectives when summaries are
                being recorded. The flag is also forwarded to the base class
                and to ``value_ops.action_importance_ratio``.
            name (str): Name of this loss, forwarded to the base class.
        """
        # PPO always uses GAE advantages and TD-lambda returns, so these two
        # base-class switches are fixed here rather than exposed to callers.
        super(PPOLoss, self).__init__(
            gamma=gamma,
            td_error_loss_fn=td_error_loss_fn,
            use_gae=True,
            td_lambda=td_lambda,
            use_td_lambda_return=True,
            normalize_advantages=normalize_advantages,
            advantage_clip=advantage_clip,
            entropy_regularization=entropy_regularization,
            td_loss_weight=td_loss_weight,
            debug_summaries=debug_summaries,
            name=name)

        self._importance_ratio_clipping = importance_ratio_clipping
        self._log_prob_clipping = log_prob_clipping
        self._check_numerics = check_numerics
        self._compute_advantages_internally = compute_advantages_internally

    def _pg_loss(self, info, advantages):
        """Compute the clipped-surrogate policy gradient loss.

        Args:
            info: Training info. This method reads ``action_distribution``,
                ``rollout_action_distribution``, ``action`` and
                ``rollout_log_prob`` from it.
            advantages (Tensor): Advantages that are multiplied with the
                importance ratios.

        Returns:
            Tensor: element-wise maximum of the unclipped and the clipped
            negated surrogate objectives.
        """
        scope = alf.summary.scope(self._name)
        importance_ratio, importance_ratio_clipped = value_ops.action_importance_ratio(
            action_distribution=info.action_distribution,
            rollout_action_distribution=info.rollout_action_distribution,
            action=info.action,
            rollout_log_prob=info.rollout_log_prob,
            clipping_mode='double_sided',
            scope=scope,
            importance_ratio_clipping=self._importance_ratio_clipping,
            log_prob_clipping=self._log_prob_clipping,
            check_numerics=self._check_numerics,
            debug_summaries=self._debug_summaries)

        # Pessimistically choose the maximum objective value for clipped and
        # unclipped importance ratios. Both terms are negated (they are
        # losses), so taking the max keeps the worse of the two.
        pg_objective = -importance_ratio * advantages
        pg_objective_clipped = -importance_ratio_clipped * advantages
        policy_gradient_loss = torch.max(pg_objective, pg_objective_clipped)

        if self._debug_summaries and alf.summary.should_record_summaries():
            with scope:
                alf.summary.histogram('pg_objective', pg_objective)
                alf.summary.histogram('pg_objective_clipped',
                                      pg_objective_clipped)

        # Debug-only sanity check; enabled through ``check_numerics``.
        if self._check_numerics:
            assert torch.all(torch.isfinite(policy_gradient_loss))

        return policy_gradient_loss

    def _calc_returns_and_advantages(self, info, value):
        """Return the returns and advantages used by the loss.

        Args:
            info: Training info. Must carry ``returns`` and ``advantages``
                when ``compute_advantages_internally`` is False, and
                ``rollout_value`` when it is True.
            value: Not used by this override; when computing internally,
                ``info.rollout_value`` is passed to the base class instead.

        Returns:
            tuple: ``(returns, advantages)``.
        """
        if not self._compute_advantages_internally:
            return info.returns, info.advantages

        # ``info.rollout_value`` is required and is used to compute the
        # advantage. This is mainly for algorithms like PPG where at this time
        # info.value is the newly computed value which is different from the
        # rollout time value prediction. The rollout time value prediction is
        # preserved in info.rollout_value.
        assert hasattr(info, 'rollout_value'), (
            'Expect info.rollout_value to exist for computing returns '
            'and advatages inside PPOLoss.')
        return super()._calc_returns_and_advantages(info, info.rollout_value)