# Source code for alf.algorithms.config

# Copyright (c) 2020 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import alf
from alf.utils.schedulers import as_scheduler


@alf.configurable
class TrainerConfig(object):
    """Configuration for training.

    The constructor validates a few option combinations and then stores every
    argument as an attribute of the same name. ``priority_replay_alpha`` and
    ``priority_replay_beta`` are passed through ``as_scheduler()`` before being
    stored, and ``data_transformer`` is initialized to ``None``.
    """

    def __init__(self,
                 root_dir,
                 conf_file='',
                 ml_type='rl',
                 algorithm_ctor=None,
                 data_transformer_ctor=None,
                 random_seed=None,
                 num_iterations=1000,
                 num_env_steps=0,
                 unroll_length=8,
                 unroll_with_grad=False,
                 async_unroll: bool = False,
                 max_unroll_length: int = 0,
                 unroll_queue_size: int = 200,
                 unroll_step_interval: float = 0,
                 unroll_parameter_update_period: int = 10,
                 use_rollout_state=False,
                 temporally_independent_train_step=None,
                 num_checkpoints=10,
                 confirm_checkpoint_upon_crash=True,
                 no_thread_env_for_conf=False,
                 evaluate=False,
                 num_evals=None,
                 eval_interval=10,
                 epsilon_greedy=0.,
                 eval_uncertainty=False,
                 num_eval_episodes=10,
                 num_eval_environments: int = 1,
                 async_eval: bool = True,
                 ddp_paras_check_interval: int = 0,
                 num_summaries=None,
                 summary_interval=50,
                 summarize_first_interval=True,
                 update_counter_every_mini_batch=False,
                 summaries_flush_secs=1,
                 summary_max_queue=10,
                 metric_min_buffer_size=10,
                 debug_summaries=False,
                 profiling=False,
                 enable_amp=False,
                 code_snapshots=None,
                 summarize_grads_and_vars=False,
                 summarize_gradient_noise_scale=False,
                 summarize_action_distributions=False,
                 summarize_output=False,
                 initial_collect_steps=0,
                 num_updates_per_train_iter=4,
                 mini_batch_length=None,
                 mini_batch_size=None,
                 whole_replay_buffer_training=True,
                 replay_buffer_length=1024,
                 priority_replay=False,
                 priority_replay_alpha=0.7,
                 priority_replay_beta=0.4,
                 priority_replay_eps=1e-6,
                 offline_buffer_dir=None,
                 offline_buffer_length=None,
                 rl_train_after_update_steps=0,
                 rl_train_every_update_steps=1,
                 empty_cache: bool = False,
                 normalize_importance_weights_by_max: bool = False,
                 clear_replay_buffer=True):
        """
        Args:
            root_dir (str): directory for saving summary and checkpoints
            conf_file (str): stored as ``self.conf_file``; presumably the path
                of the configuration file used for the run (it is not
                interpreted by this class -- confirm with the trainer).
            ml_type (str): type of learning task, one of ['rl', 'sl']
            algorithm_ctor (Callable): callable that create an
                ``OffPolicyAlgorithm`` or ``OnPolicyAlgorithm`` instance
            data_transformer_ctor (Callable|list[Callable]): Function(s)
                for creating data transformer(s). Each of them will be called
                as ``data_transformer_ctor(observation_spec)`` to create a data
                transformer. Available transformers are in ``algorithms.data_transformer``.
                The data transformer constructed by this can be accessed as
                ``TrainerConfig.data_transformer``.
                Important Note: ``HindsightExperienceTransformer``, ``FrameStacker`` or
                any data transformer that need to access the replay buffer
                for additional data need to be before all other data transformers.
                The reason is the following:
                In off policy training, the replay buffer stores raw input w/o being
                processed by any data transformer.  If say ``ObservationNormalizer`` is
                applied before hindsight, then data retrieved by replay will be
                normalized whereas hindsight data directly pulled from the replay buffer
                will not be normalized.  Data will be in mismatch, causing training to
                suffer and potentially fail.
            random_seed (None|int): random seed, a random seed is used if None
            num_iterations (int): For RL trainer, indicates number of update
                iterations (ignored if 0). Note that for off-policy algorithms, if
                ``initial_collect_steps>0``, then the first
                ``initial_collect_steps//(unroll_length*num_envs)`` iterations
                won't perform any training. For SL trainer, indicates the number
                of training epochs. If both `num_iterations` and `num_env_steps`
                are set, `num_iterations` must be big enough to consume so many
                environment steps. And after `num_env_steps` environment steps are
                generated, the training will not interact with environments
                anymore, which means that it will only train on replay buffer.
            num_env_steps (int): number of environment steps (ignored if 0). The
                total number of FRAMES will be (``num_env_steps*frame_skip``) for
                calculating sample efficiency. See alf/environments/wrappers.py
                for the definition of FrameSkip.
            unroll_length (float):  number of time steps each environment proceeds per
                iteration. The total number of time steps from all environments per
                iteration can be computed as: ``num_envs * env_batch_size * unroll_length``.
                If ``unroll_length`` is not an integer, the actual unroll_length
                being used will fluctuate between ``floor(unroll_length)`` and
                ``ceil(unroll_length)`` and the expectation will be equal to
                ``unroll_length``.
            unroll_with_grad (bool): a bool flag indicating whether we require
                grad during ``unroll()``. This flag is only used by
                ``OffPolicyAlgorithm`` where unrolling with grads is usually
                unnecessary and turned off for saving memory. However, when there
                is an on-policy sub-algorithm, we can enable this flag for its
                training. ``OnPolicyAlgorithm`` always unrolls with grads and this
                flag doesn't apply to it. Must be False if ``async_unroll`` is
                True.
            async_unroll: whether to unroll asynchronously. If True, unroll will
                be performed in parallel with training.
            max_unroll_length: the maximal length of unroll results for each iteration.
                If the time for one step of training is less than the time for
                unrolling ``max_unroll_length`` steps, the length of the unroll
                results will be less than ``max_unroll_length``. Only used if
                ``async_unroll`` is True and unroll_length==0. Must be positive
                if ``async_unroll`` is True. If it is 0, the stored
                ``max_unroll_length`` attribute falls back to ``unroll_length``.
            unroll_queue_size: the size of the queue for transmitting unroll
                results from the unroll process to the main process. Only used
                if ``async_unroll`` is True. If the queue is full, the unroll process
                will wait for the main process to retrieve unroll results from
                the queue before performing more unrolls.
            unroll_step_interval: if not zero, the time interval in second
                between each two environment steps. Only used if ``async_unroll`` is True.
                This is useful if the interaction with the environment happens
                in real time (e.g. real world robot or real time simulation) and
                you want a fixed interaction frequency with the environment.
                Note that this will not have any effect if environment step and
                rollout step together spend more than unroll_step_interval.
            unroll_parameter_update_period: update the parameter for the asynchronous
                unroll every so many iterations. Only used if ``async_unroll`` is True.
            use_rollout_state (bool): If True, when off-policy training, the RNN
                states will be taken from the replay buffer; otherwise they will
                be set to 0. In the case of True, the ``train_state_spec`` of an
                algorithm should always be a subset of the ``rollout_state_spec``.
            temporally_independent_train_step (bool|None): If True, the ``train_step``
                is called with all the experiences in one batch instead of being
                called sequentially with ``mini_batch_length`` batches. Only used
                by ``OffPolicyAlgorithm``. In general, this option can only be
                used if the algorithm has no state. For Algorithm with state (e.g.
                ``SarsaAlgorithm`` not using RNN), if there is no need to
                recompute state at train_step, this option can also be used. If
                ``None``, its value is inferred based on whether the algorithm
                has RNN state (``True`` if there is no RNN state, ``False``
                otherwise).
            num_checkpoints (int): how many checkpoints to save for the training
            confirm_checkpoint_upon_crash (bool): whether to prompt for whether
                do checkpointing after crash.
            no_thread_env_for_conf (bool): not to create an unwrapped env for
                the purpose of showing operative configurations. If True, no
                ``ThreadEnvironment`` will ever be created, regardless of the
                value of ``TrainerConfig.evaluate``. If False, a
                ``ThreadEnvironment`` will be created if ``TrainerConfig.evaluate``
                or the training env is a ``ParallelAlfEnvironment`` instance.
                For an env that consume lots of resources, this flag can be set to
                ``True`` if no evaluation is needed to save resources. The decision
                of creating an unwrapped env won't affect training; it's used to
                correctly display inoperative configurations in subprocesses.
            evaluate (bool): A bool to evaluate when training
            num_evals (int): how many evaluations are needed throughout the training.
                If not None, an automatically calculated ``eval_interval`` will
                replace ``config.eval_interval``.
            eval_interval (int): evaluate every so many iteration
            epsilon_greedy (float): a floating value in [0,1], representing the
                chance of action sampling instead of taking argmax. This can
                help prevent a dead loop in some deterministic environment like
                Breakout. Only used for evaluation.
            eval_uncertainty (bool): whether to evaluate uncertainty after training.
            num_eval_episodes (int) : number of episodes for one evaluation.
            num_eval_environments: the number of environments for evaluation.
            async_eval: whether to do evaluation asynchronously in a different
                process. Note that this may use more memory.
            ddp_paras_check_interval: if >0, then every so many iterations the trainer
                will perform a consistency check of the model parameters across
                different worker processes, if multi-gpu training is used.
            num_summaries (int): how many summary calls are needed throughout the
                training. If not None, an automatically calculated ``summary_interval``
                will replace ``config.summary_interval``. Note that this number
                doesn't include the summary steps of the first interval if
                ``summarize_first_interval=True``. In this case, the actual number
                of summaries will be roughly this number plus the calculated
                summary interval.
            summary_interval (int): write summary every so many training steps
            summarize_first_interval (bool): whether to summarize every step of
                the first interval (default True). It might be better to turn
                this off for an easier post-processing of the curve.
            update_counter_every_mini_batch (bool): whether to update counter
                for every mini batch. The ``summary_interval`` is based on this
                counter. Typically, this should be False. Set to True if you
                want to have summary for every mini batch for the purpose of
                debugging. Only used by ``OffPolicyAlgorithm``.
            summaries_flush_secs (int): flush summary to disk every so many seconds
            summary_max_queue (int): flush to disk every so many summaries
            metric_min_buffer_size (int): a minimal size of the buffer used to
                construct some average episodic metrics used in ``RLAlgorithm``.
            debug_summaries (bool): A bool to gather debug summaries.
            profiling (bool): If True, use cProfile to profile the training. The
                profile result will be written to ``root_dir``/py_train.INFO.
            enable_amp: whether to use automatic mixed precision for training.
                This can make the training faster if the algorithm is GPU intensive.
                However, the result may be different (most likely due to random
                fluctuation).
            code_snapshots (list[str]): an optional list of code files to write
                to tensorboard text. Note: the code file path should be relative
                to "<ALF_ROOT>/alf", e.g., "algorithms/agent.py". This can be
                useful for tracking code changes when running a job.
            summarize_grads_and_vars (bool): If True, gradient and network variable
                summaries will be written during training.
            summarize_gradient_noise_scale (bool): whether summarize gradient
                noise scale. See ``alf.optimizers.utils.py`` for details.
            summarize_action_distributions (bool): stored as
                ``self.summarize_action_distributions``; presumably enables
                summaries of action distributions (the consumer is not
                visible in this file -- confirm).
            summarize_output (bool): If True, summarize output of certain networks.
            initial_collect_steps (int): if positive, number of steps each single
                environment steps before perform first update. Only used
                by ``OffPolicyAlgorithm``.
            num_updates_per_train_iter (int): number of optimization steps for
                one iteration. Only used by ``OffPolicyAlgorithm``.
            mini_batch_size (int): number of sequences for each minibatch. If None,
                it's set to the replayer's ``batch_size``. Only used by
                ``OffPolicyAlgorithm``.
            mini_batch_length (int): the length of the sequence for each
                sample in the minibatch. Only used by ``OffPolicyAlgorithm``.
            whole_replay_buffer_training (bool): whether use all data in replay
                buffer to perform one update. Only used by ``OffPolicyAlgorithm``.
            clear_replay_buffer (bool): whether use all data in replay buffer to
                perform one update and then wiped clean. Only used by
                ``OffPolicyAlgorithm``.
            replay_buffer_length (int): the maximum number of steps the replay
                buffer store for each environment. Only used by
                ``OffPolicyAlgorithm``.
            priority_replay (bool): Use prioritized sampling if this is True.
            priority_replay_alpha (float|Scheduler): The priority from LossInfo is powered
                to this as an argument for ``ReplayBuffer.update_priority()``.
                Note that the effect of ``ReplayBuffer.initial_priority``
                may change with different values of ``priority_replay_alpha``.
                Hence you may need to adjust ``ReplayBuffer.initial_priority``
                accordingly.
            priority_replay_beta (float|Scheduler): weight the loss of each sample by
                ``importance_weight**(-priority_replay_beta)``, where ``importance_weight``
                is from the BatchInfo returned by ``ReplayBuffer.get_batch()``.
                This is only useful if ``prioritized_sampling`` is enabled for
                ``ReplayBuffer``. If given as a plain number, it must be
                non-negative.
            priority_replay_eps (float): minimum priority for priority replay.
            offline_buffer_dir (str|[str]): path to the offline replay buffer
                checkpoint to be loaded. If a list of strings provided, each
                will represent the directory to one replay buffer checkpoint.
            offline_buffer_length (int): the maximum length will be loaded
                from each replay buffer checkpoint. Therefore the total
                buffer length is offline_buffer_length * len(offline_buffer_dir).
                If None, all the samples from all the provided replay buffer
                checkpoints will be loaded.
            rl_train_after_update_steps (int): only used in the hybrid training
                mode. It is used as a starting criteria for the normal (non-offline)
                part of the RL training, which only starts after so many number
                of update steps (according to ``global_counter``).
            rl_train_every_update_steps (int): only used in the hybrid training
                mode. It is used to control the update frequency of the normal
                (non-offline) part of the RL training  (according to
                ``global_counter``). Through this flag, we can have a more fine
                grained control over the update frequencies of online and offline
                RL training (currently assumes the training frequency of offline
                RL is always higher or equal to the online RL part).
                For example, we can set ``rl_train_every_update_steps = 2``
                to have a train config that executes online RL training at the
                half frequency of that of the offline RL training.
            empty_cache: empty GPU memory cache at the start of every iteration
                to reduce GPU memory usage. This option may slightly slow down
                the overall speed.
            normalize_importance_weights_by_max: if True, normalize the importance
                weights by its max to prevent instability caused by large importance
                weight.

        Raises:
            AssertionError: if ``priority_replay_beta`` is a negative number,
                if ``ml_type`` is not ``'rl'`` or ``'sl'``, or if
                ``async_unroll`` is True while ``unroll_with_grad`` is True or
                ``max_unroll_length`` is not positive.
        """
        # Only plain numbers can be range-checked here; any other value
        # (e.g. a scheduler) is handed to as_scheduler() below unchecked.
        # ints are included so that a value such as -1 is rejected too.
        if isinstance(priority_replay_beta, (int, float)):
            assert priority_replay_beta >= 0.0, (
                "priority_replay_beta should be non-negative")
        assert ml_type in ('rl', 'sl')
        self.root_dir = root_dir
        self.conf_file = conf_file
        self.ml_type = ml_type
        self.algorithm_ctor = algorithm_ctor
        self.data_transformer_ctor = data_transformer_ctor
        self.data_transformer = None  # to be set by Trainer
        self.random_seed = random_seed
        self.num_iterations = num_iterations
        self.num_env_steps = num_env_steps
        self.unroll_length = unroll_length
        self.unroll_with_grad = unroll_with_grad
        self.async_unroll = async_unroll
        if async_unroll:
            assert not unroll_with_grad, ("unroll_with_grad is not supportd "
                                          "for async_unroll=True")
            assert max_unroll_length > 0, ("max_unroll_length needs to be set "
                                           "for async_unroll=True")
        # A max_unroll_length of 0 (the default) is falsy, so the attribute
        # falls back to unroll_length in that case.
        self.max_unroll_length = max_unroll_length or self.unroll_length
        self.unroll_queue_size = unroll_queue_size
        self.unroll_step_interval = unroll_step_interval
        self.unroll_parameter_update_period = unroll_parameter_update_period
        self.use_rollout_state = use_rollout_state
        self.temporally_independent_train_step = temporally_independent_train_step
        self.num_checkpoints = num_checkpoints
        self.confirm_checkpoint_upon_crash = confirm_checkpoint_upon_crash
        self.no_thread_env_for_conf = no_thread_env_for_conf
        self.evaluate = evaluate
        self.num_evals = num_evals
        self.eval_interval = eval_interval
        self.epsilon_greedy = epsilon_greedy
        self.eval_uncertainty = eval_uncertainty
        self.num_eval_episodes = num_eval_episodes
        self.num_eval_environments = num_eval_environments
        self.async_eval = async_eval
        self.ddp_paras_check_interval = ddp_paras_check_interval
        self.num_summaries = num_summaries
        self.summary_interval = summary_interval
        self.summarize_first_interval = summarize_first_interval
        self.update_counter_every_mini_batch = update_counter_every_mini_batch
        self.summaries_flush_secs = summaries_flush_secs
        self.summary_max_queue = summary_max_queue
        self.metric_min_buffer_size = metric_min_buffer_size
        self.debug_summaries = debug_summaries
        self.profiling = profiling
        self.enable_amp = enable_amp
        self.code_snapshots = code_snapshots
        self.summarize_grads_and_vars = summarize_grads_and_vars
        self.summarize_gradient_noise_scale = summarize_gradient_noise_scale
        self.summarize_action_distributions = summarize_action_distributions
        self.summarize_output = summarize_output
        self.initial_collect_steps = initial_collect_steps
        self.num_updates_per_train_iter = num_updates_per_train_iter
        self.mini_batch_length = mini_batch_length
        self.mini_batch_size = mini_batch_size
        self.whole_replay_buffer_training = whole_replay_buffer_training
        self.clear_replay_buffer = clear_replay_buffer
        self.replay_buffer_length = replay_buffer_length
        self.priority_replay = priority_replay
        # alpha/beta may be given as a float or a Scheduler (see Args);
        # as_scheduler() presumably wraps plain numbers -- confirm in
        # alf.utils.schedulers.
        self.priority_replay_alpha = as_scheduler(priority_replay_alpha)
        self.priority_replay_beta = as_scheduler(priority_replay_beta)
        self.priority_replay_eps = priority_replay_eps
        # offline options
        self.offline_buffer_dir = offline_buffer_dir
        self.offline_buffer_length = offline_buffer_length
        self.rl_train_after_update_steps = rl_train_after_update_steps
        self.rl_train_every_update_steps = rl_train_every_update_steps
        self.empty_cache = empty_cache
        self.normalize_importance_weights_by_max = normalize_importance_weights_by_max