Source code for alf.networks.encoding_networks

# Copyright (c) 2020 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import numpy as np
from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn
from .containers import _Sequential
from .network import Network
import alf
import alf.layers as layers
from alf.initializers import variance_scaling_init
from alf.tensor_specs import TensorSpec
from alf.utils import common
from alf.nest.utils import get_outer_rank


@alf.configurable
class ImageEncodingNetwork(_Sequential):
    """Convolutional encoder built as a plain stack of ``layers.Conv2D``.

    The output is either the final feature map or, when requested, that
    feature map flattened into a vector.
    """

    def __init__(self,
                 input_channels,
                 input_size,
                 conv_layer_params,
                 same_padding=False,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 flatten_output=False,
                 name="ImageEncodingNetwork"):
        """Create the conv layers that encode an image.

        Nested inputs are not handled by this class.

        Output size of each conv layer (see
        `<https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html>`_)::

            H = (H1 - HF + 2P) // strides + 1

        where H = output size, H1 = input size, HF = kernel size and
        P = padding.

        About padding: with ``same_padding=False`` the padding is whatever the
        layer params provide (0 if omitted), which corresponds to TF's
        ``valid`` mode when nothing is provided. With ``same_padding=True``
        ("half padding") the padding is always ``P = (HF - 1) // 2`` on both
        sides. For ``strides=1`` and an odd ``HF`` the output has the same size
        as the input; for an even ``HF`` the output is smaller by 1, because
        the same padding is applied on both sides.

        Args:
            input_channels (int): number of channels of the input image.
            input_size (int or tuple): input image size (height, width).
            conv_layer_params (tuple[tuple]): a non-empty tuple of tuples
                ``(num_filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            same_padding (bool): if True, paddings given in
                ``conv_layer_params`` are replaced by ``(HF - 1) // 2``;
                if False, the provided paddings (default 0) are used.
            activation (Callable): activation used by every conv layer.
            kernel_initializer (Callable): initializer passed to every conv
                layer.
            flatten_output (bool): if False, the output keeps the image
                structure ``BxCxHxW``; otherwise it is flattened to ``BxN``.
            name (str): name of this network.
        """
        image_hw = common.tuplify2d(input_size)
        input_tensor_spec = TensorSpec((input_channels, ) + image_hw)

        assert isinstance(conv_layer_params, tuple)
        assert len(conv_layer_params) > 0

        conv_stack = []
        in_ch = input_channels
        for layer_spec in conv_layer_params:
            num_filters, kernel, stride = layer_spec[:3]
            pad = 0
            if len(layer_spec) > 3:
                pad = layer_spec[3]
            if same_padding:
                # Half padding overrides any user-provided padding.
                kernel = common.tuplify2d(kernel)
                pad = ((kernel[0] - 1) // 2, (kernel[1] - 1) // 2)
            conv_layer = layers.Conv2D(
                in_ch,
                num_filters,
                kernel,
                activation=activation,
                kernel_initializer=kernel_initializer,
                strides=stride,
                padding=pad)
            conv_stack.append(conv_layer)
            # The next layer consumes what this one produces.
            in_ch = num_filters

        if flatten_output:
            conv_stack.append(alf.layers.Reshape((-1, )))

        super().__init__(
            conv_stack, input_tensor_spec=input_tensor_spec, name=name)
@alf.configurable
class ImageDecodingNetwork(_Sequential):
    """Transposed-convolution decoder that maps a latent vector to an image.

    The latent vector is first projected by FC layers, reshaped into a small
    feature map and then expanded by a stack of ``layers.ConvTranspose2D``.
    """

    def __init__(self,
                 input_size,
                 transconv_layer_params,
                 start_decoding_size,
                 start_decoding_channels,
                 same_padding=False,
                 preprocess_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 output_activation=torch.tanh,
                 name="ImageDecodingNetwork"):
        """Create the layers that decode a latent vector into an image.

        Nested inputs are not handled by this class.

        Output size of each transposed conv layer (see
        `<https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html>`_)::

            H = (H1 - 1) * strides + HF - 2P + OP

        where H = output size, H1 = input size, HF = kernel size, P = padding
        and OP = output_padding (this class never sets it).

        About padding: with ``same_padding=False`` the padding is whatever the
        layer params provide (0 if omitted). With ``same_padding=True``
        ("half padding") the padding is always ``P = (HF - 1) // 2``. For
        ``strides=1`` and an odd ``HF`` the output has the same size as the
        input; for an even ``HF`` the output is larger by 1.

        Args:
            input_size (int): size of the input latent vector.
            transconv_layer_params (tuple[tuple]): a non-empty tuple of tuples
                ``(num_filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            start_decoding_size (int or tuple): initial height and width of
                the feature map.
            start_decoding_channels (int): initial number of channels of the
                feature map. The latent vector is always projected by an FC
                layer to a vector that can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``).
            same_padding (bool): if True, paddings given in
                ``transconv_layer_params`` are replaced by ``(HF - 1) // 2``;
                if False, the provided paddings (default 0) are used.
            preprocess_fc_layer_params (tuple[int]): sizes of the FC layers
                applied to the latent vector before the projection layer.
            activation (Callable): activation of the hidden layers.
            kernel_initializer (Callable): initializer passed to all layers.
            output_activation (Callable): activation of the last transposed
                conv layer. Image targets are usually normalized to [0, 1] or
                [-1, 1], so this would typically be ``torch.sigmoid`` or
                ``torch.tanh``.
            name (str): name of this network.
        """
        input_tensor_spec = TensorSpec((input_size, ))

        assert isinstance(transconv_layer_params, tuple)
        assert len(transconv_layer_params) > 0

        stack = []
        fc_in = input_size
        if preprocess_fc_layer_params is not None:
            for units in preprocess_fc_layer_params:
                stack.append(
                    layers.FC(
                        fc_in,
                        units,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                fc_in = units

        start_hw = common.tuplify2d(start_decoding_size)
        # Channels come first, following the pytorch layout.
        start_decoding_shape = [
            start_decoding_channels, start_hw[0], start_hw[1]
        ]
        projection = layers.FC(
            fc_in,
            np.prod(start_decoding_shape),
            activation=activation,
            kernel_initializer=kernel_initializer)
        stack.append(projection)
        stack.append(alf.layers.Reshape(start_decoding_shape))

        last_index = len(transconv_layer_params) - 1
        in_ch = start_decoding_channels
        for index, layer_spec in enumerate(transconv_layer_params):
            num_filters, kernel, stride = layer_spec[:3]
            pad = 0
            if len(layer_spec) > 3:
                pad = layer_spec[3]
            if same_padding:
                # Half padding overrides any user-provided padding.
                kernel = common.tuplify2d(kernel)
                pad = ((kernel[0] - 1) // 2, (kernel[1] - 1) // 2)
            # Only the final layer uses the output activation.
            layer_act = output_activation if index == last_index else activation
            stack.append(
                layers.ConvTranspose2D(
                    in_ch,
                    num_filters,
                    kernel,
                    activation=layer_act,
                    kernel_initializer=kernel_initializer,
                    strides=stride,
                    padding=pad))
            in_ch = num_filters

        super().__init__(stack, input_tensor_spec=input_tensor_spec, name=name)
@alf.configurable
class ImageDecodingNetworkV2(_Sequential):
    """Image decoding using upsampling+convolution.

    Unlike ``ImageDecodingNetwork``, which uses transposed convolutions to
    turn a small input into a larger image, this class uses upsampling
    followed by convolution layers. The idea is to let the conv layers refine
    the upsampling (e.g., nearest neighbor, bilinear, etc) results. The
    difference between transposed conv and upsampling+conv is discussed in
    `<https://distill.pub/2016/deconv-checkerboard/>`_. In short,
    upsampling+conv might help reduce the checkerboard artifacts that are
    common in the outputs of transposed convolutions.
    """

    def __init__(self,
                 input_size: int,
                 upsample_conv_layer_params: Tuple[Union[int, Tuple[int]]],
                 start_decoding_size: Union[int, Tuple[int]],
                 start_decoding_channels: int,
                 preprocess_fc_layer_params: Optional[Tuple[int]] = None,
                 upsampling_mode: str = 'nearest',
                 same_padding: bool = False,
                 activation: Callable = torch.relu_,
                 kernel_initializer: Optional[Callable] = None,
                 output_activation: Callable = torch.tanh,
                 name: str = "ImageDecodingNetworkV2"):
        """An example network of upsampling+conv for decoding images.

        .. code-block:: python

            net = ImageDecodingNetworkV2(input_size=100,
                                         start_decoding_size=10,
                                         start_decoding_channels=8,
                                         same_padding=True,
                                         upsample_conv_layer_params=(
                                            2, (16, 3, 1), (32, 3, 1),
                                            2, (64, 3, 1), (3, 3, 1)))
            # The image shape: (8,10,10) -> (8,20,20) -> (16,20,20) -> (32,20,20)
            #                  -> (32,40,40) -> (64,40,40) -> (3,40,40)

        Args:
            input_size: the size of the input latent vector.
            upsample_conv_layer_params: a non-empty tuple of ints or tuples.
                An int is the scaling factor of a ``torch.nn.Upsample`` layer;
                any other element should be a tuple of conv params
                ``(num_filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            start_decoding_size: the initial height and width of the feature
                map.
            start_decoding_channels: the initial number of channels of the
                feature map. The latent vector is always projected by an FC
                layer to a vector that can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``).
            preprocess_fc_layer_params: if not None, the input is first fed to
                FC layers of these sizes, before the projection layer.
            upsampling_mode: the ``mode`` argument given to
                ``torch.nn.Upsample``.
            same_padding: similar to TF's conv2d ``same`` padding mode. If
                True, the paddings provided in ``upsample_conv_layer_params``
                are replaced by ``(kernel_size - 1) // 2``; if False, the
                provided paddings (default 0) are used. Please refer to the
                docstring of ``ImageEncodingNetwork`` for the two padding
                modes.
            activation: activation of the hidden layers.
            kernel_initializer: initializer passed to all layers.
            output_activation: activation of the last conv layer. It is
                applied to the last conv entry of
                ``upsample_conv_layer_params`` even if that entry is followed
                by upsampling factors. Image targets are usually normalized to
                [0, 1] or [-1, 1], so this would typically be
                ``torch.sigmoid`` or ``torch.tanh``.
            name: name of this network.
        """
        input_tensor_spec = TensorSpec((input_size, ))

        assert isinstance(upsample_conv_layer_params, tuple)
        assert len(upsample_conv_layer_params) > 0

        start_decoding_size = common.tuplify2d(start_decoding_size)
        # pytorch assumes "channels_first" !
        start_decoding_shape = [
            start_decoding_channels, start_decoding_size[0],
            start_decoding_size[1]
        ]

        nets = []
        if preprocess_fc_layer_params is not None:
            for size in preprocess_fc_layer_params:
                nets.append(
                    layers.FC(
                        input_size,
                        size,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                input_size = size

        nets.extend([
            layers.FC(
                input_size,
                np.prod(start_decoding_shape),
                activation=activation,
                kernel_initializer=kernel_initializer),
            alf.layers.Reshape(start_decoding_shape)
        ])

        # The output activation belongs to the last *conv* layer. Using the
        # index of the last element instead would silently drop
        # ``output_activation`` whenever the params end with an upsampling
        # factor. -1 means there is no conv layer at all.
        last_conv_index = max(
            (i for i, paras in enumerate(upsample_conv_layer_params)
             if not isinstance(paras, int)),
            default=-1)

        in_channels = start_decoding_channels
        for i, paras in enumerate(upsample_conv_layer_params):
            if isinstance(paras, int):
                nets.append(
                    torch.nn.Upsample(
                        scale_factor=paras, mode=upsampling_mode))
                continue

            filters, kernel_size, strides = paras[:3]
            padding = paras[3] if len(paras) > 3 else 0
            if same_padding:  # overwrite paddings
                kernel_size = common.tuplify2d(kernel_size)
                padding = ((kernel_size[0] - 1) // 2,
                           (kernel_size[1] - 1) // 2)
            act = output_activation if i == last_conv_index else activation
            nets.append(
                layers.Conv2D(
                    in_channels,
                    filters,
                    kernel_size,
                    activation=act,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding))
            in_channels = filters

        super().__init__(nets, input_tensor_spec=input_tensor_spec, name=name)
def SpatialBroadcastDecodingNetwork(
        input_size: int,
        output_height: int,
        conv_layer_params: Tuple[Tuple[int]],
        output_width: Optional[int] = None,
        fc_layer_params: Optional[Tuple[int]] = None,
        activation: Callable = torch.relu_,
        output_activation: Callable = alf.utils.math_ops.identity,
        name: str = "SpatialBroadcastDecodingNetwork"):
    """Implements the spatial broadcast decoder in `Watters et al. 2019,
    Spatial Broadcast Decoder: A Simple Architecture for Learning Disentangled
    Representations in VAEs <https://arxiv.org/abs/1901.07017>`_.

    In short, given a latent embedding and a target output height/width, this
    decoder first spatially broadcasts the embedding over ``height*width``,
    appends a uniform ``xy`` meshgrid in [-1,1], and applies conv layers. All
    conv layers use ``same_padding=True``.

    Args:
        input_size: the latent embedding size.
        output_height: the target output image height.
        conv_layer_params: a non-empty tuple of conv layer params applied
            after broadcasting. The last entry uses ``output_activation``;
            all preceding entries (possibly none) use ``activation``.
        output_width: if None, it's equal to ``output_height``.
        fc_layer_params: a tuple of fc layers applied to the input embedding
            before broadcasting.
        activation: activation of the fc layers and the intermediate conv
            layers.
        output_activation: the final activation.
        name: name of the returned network.

    Returns:
        an ``alf.nn.Sequential`` network.
    """
    assert isinstance(conv_layer_params, tuple) and len(conv_layer_params) > 0

    input_tensor_spec = TensorSpec((input_size, ))
    if output_width is None:
        output_width = output_height

    proj = alf.math.identity
    if fc_layer_params is not None:
        proj = EncodingNetwork(
            input_tensor_spec=input_tensor_spec,
            fc_layer_params=fc_layer_params,
            activation=activation)

    preproc_net = alf.nn.Sequential(
        proj,
        functools.partial(
            alf.utils.tensor_utils.spatial_broadcast,
            im_shape=(output_height, output_width)),
        alf.utils.tensor_utils.append_coordinate,
        input_tensor_spec=input_tensor_spec)

    nets = [preproc_net]
    spec = preproc_net.output_spec

    # ``ImageEncodingNetwork`` rejects an empty ``conv_layer_params``, so the
    # intermediate stack can only be created when there is at least one layer
    # before the output layer. Otherwise a single-layer decoder would crash.
    hidden_conv_params = conv_layer_params[:-1]
    if len(hidden_conv_params) > 0:
        conv_net = ImageEncodingNetwork(
            input_channels=spec.shape[0],
            input_size=spec.shape[1:],
            conv_layer_params=hidden_conv_params,
            same_padding=True,
            activation=activation)
        nets.append(conv_net)
        spec = conv_net.output_spec

    last_conv_net = ImageEncodingNetwork(
        input_channels=spec.shape[0],
        input_size=spec.shape[1:],
        conv_layer_params=conv_layer_params[-1:],
        same_padding=True,
        activation=output_activation)
    nets.append(last_conv_net)

    return alf.nn.Sequential(*nets, name=name)
@alf.configurable
class AutoShapeImageDeconvNetwork(_Sequential):
    """
    A general template class for creating image deconv (transposed
    convolutional) networks with auto-shape inference (thus named as
    ``AutoShapeImageDeconvNetwork``).
    """

    def __init__(self,
                 input_size: int,
                 transconv_layer_params: Tuple,
                 output_shape: Tuple,
                 start_decoding_channels: int,
                 preprocess_fc_layer_params: Optional[Tuple] = None,
                 activation: Optional[Callable] = torch.relu_,
                 kernel_initializer: Optional[Callable] = None,
                 output_activation: Optional[Callable] = torch.tanh,
                 name="AutoShapeImageDeconvNetwork"):
        """
        Auto-shape inference: instead of specifying an initial start shape for
        image deconv, this class only needs the desired output shape of the
        image. It walks ``transconv_layer_params`` backwards from that output
        shape to compute the shape to start decoding from, together with the
        ``output_padding`` each transposed conv needs in order to reproduce
        the sizes exactly, and uses a FC layer to map the latent vector to the
        start shape.

        Args:
            input_size (int): the size of the input latent vector
            transconv_layer_params (tuple[tuple]): a non-empty tuple of tuple
                (num_filters, kernel_size, strides, padding), where
                ``padding`` is optional.
            output_shape (tuple): the complete output shape ``(c, h, w)``.
                ``c`` must equal ``num_filters`` of the last layer.
            start_decoding_channels (int): the initial number of channels we'd
                like to have for the feature map. Note that we always first
                project an input latent vector into a vector of an appropriate
                length so that it can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``), where ``start_decoding_height`` and
                ``start_decoding_width`` are automatically inferred based on
                the specified ``output_shape`` and ``transconv_layer_params``.
            preprocess_fc_layer_params (tuple[int]): a tuple of fc layer
                units. These fc layers are used for preprocessing the latent
                vector before transposed convolutions.
            activation (nn.functional): activation for hidden layers
            kernel_initializer (Callable): initializer for all the layers.
            output_activation (nn.functional): activation for the output
                layer. Usually our image inputs are normalized to [0, 1] or
                [-1, 1], so this function should be ``torch.sigmoid`` or
                ``torch.tanh``.
            name (str): name of this network.
        """
        # Validate the layer params before they are indexed below.
        assert isinstance(transconv_layer_params, tuple)
        assert len(transconv_layer_params) > 0
        assert len(output_shape) == 3, "the output_shape should be (c, h, w)"
        assert output_shape[0] == transconv_layer_params[-1][0], (
            "channel number mis-match")

        # Walk the layers from the output back to the input: the input size of
        # a transposed conv is the output size of the corresponding conv.
        out_paddings = []
        out_shape = output_shape[1:]
        for paras in reversed(transconv_layer_params):
            _, kernel_size, stride = paras[:3]
            kernel_size = common.tuplify2d(kernel_size)
            padding = common.tuplify2d(paras[3] if len(paras) > 3 else 0)
            conv_shape = self._calc_conv_out_shape(out_shape, padding,
                                                   kernel_size, stride)
            # Fail early instead of building zero/negative sized layers.
            assert all(s > 0 for s in conv_shape), (
                "cannot reach size {} with layer params {}: the inferred "
                "input size {} is not positive".format(
                    tuple(out_shape), paras, conv_shape))
            out_paddings.append(
                self._calc_output_padding_shape(out_shape, conv_shape,
                                                padding, kernel_size, stride))
            out_shape = conv_shape
        # The paddings were collected last-layer-first.
        out_paddings.reverse()

        input_tensor_spec = TensorSpec((input_size, ))

        nets = []
        if preprocess_fc_layer_params is not None:
            for size in preprocess_fc_layer_params:
                nets.append(
                    layers.FC(
                        input_size,
                        size,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                input_size = size

        # ``out_shape`` now holds the inferred (height, width) to start from.
        start_decoding_shape = [
            start_decoding_channels, out_shape[0], out_shape[1]
        ]
        nets.append(
            layers.FC(
                input_size,
                np.prod(start_decoding_shape),
                activation=activation,
                kernel_initializer=kernel_initializer))
        nets.append(alf.layers.Reshape(start_decoding_shape))

        in_channels = start_decoding_channels
        last_index = len(transconv_layer_params) - 1
        for i, (paras, output_padding) in enumerate(
                zip(transconv_layer_params, out_paddings)):
            filters, kernel_size, strides = paras[:3]
            padding = paras[3] if len(paras) > 3 else 0
            act = output_activation if i == last_index else activation
            nets.append(
                layers.ConvTranspose2D(
                    in_channels,
                    filters,
                    kernel_size,
                    activation=act,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding,
                    output_padding=output_padding))
            in_channels = filters

        super().__init__(nets, input_tensor_spec=input_tensor_spec, name=name)

    def _calc_conv_out_shape(self, input_size, padding, kernel_size, stride):
        """Calculate the output shape of a conv2d operation.

        Reference:
        `<https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html>`_.

        Args:
            input_size (tuple): input (height, width).
            padding (tuple): padding per axis.
            kernel_size (tuple): kernel size per axis.
            stride (int or tuple): a stride shared by both axes, or one stride
                per axis.

        Returns:
            tuple: ``(x + 2 * p - k) // s + 1`` for each axis.
        """
        if not isinstance(stride, (tuple, list)):
            stride = (stride, stride)
        return tuple((x + 2 * p - k) // s + 1
                     for x, p, k, s in zip(input_size, padding, kernel_size,
                                           stride))

    def _calc_output_padding_shape(self, input_size, conv_out, padding,
                                   kernel_size, stride):
        """Calculate the necessary output padding to be used for
        ``ConvTranspose2D`` to ensure the image obtained from it will have a
        size that matches ``input_size``.

        Args:
            input_size (tuple): the size the transposed conv must produce.
            conv_out (tuple): the size fed to the transposed conv, i.e. the
                result of ``_calc_conv_out_shape`` for ``input_size``.
            padding (tuple): padding per axis.
            kernel_size (tuple): kernel size per axis.
            stride (int or tuple): a stride shared by both axes, or one stride
                per axis.

        Returns:
            tuple: ``x - (c - 1) * s + 2 * p - k`` for each axis.
        """
        if not isinstance(stride, (tuple, list)):
            stride = (stride, stride)
        return tuple(x - (c - 1) * s + 2 * p - k
                     for x, c, p, k, s in zip(input_size, conv_out, padding,
                                              kernel_size, stride))
@alf.configurable
class EncodingNetwork(_Sequential):
    """Feed-forward encoder made of optional preprocessing, optional conv
    layers and FC layers, whose last layer can be configured independently
    of the others.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec=None,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 use_fc_bn=False,
                 last_layer_size=None,
                 last_activation=None,
                 last_kernel_initializer=None,
                 last_use_fc_bn=False,
                 name="EncodingNetwork"):
        """
        Args:
            input_tensor_spec (nested TensorSpec): the (nested) tensor spec of
                the input. A nested spec requires ``preprocessing_combiner``.
            output_tensor_spec (None|TensorSpec): spec of the output. If None,
                the output is the 1-D result of the last layer. Otherwise the
                output is reshaped to ``output_tensor_spec.shape``, whose
                number of elements must match the size of the last layer. It
                is only used for reshaping and does not affect any layer.
            input_preprocessors (nested Network|nn.Module|None): a nest of
                preprocessors, each applied to the corresponding input. If
                given, it must have the same structure as
                ``input_tensor_spec``; ``None`` entries are replaced by
                identity layers. Useful for configuring separate
                preprocessing per input, e.g. embedding a discrete input
                before it is concatenated with a continuous vector.
            preprocessing_combiner (NestCombiner): combines a nested input
                into a single tensor. It must also accept
                ``input_tensor_spec`` as its input in order to compute the
                combined spec. For example, see ``alf.nest.utils.NestConcat``.
            conv_layer_params (tuple[tuple]): a tuple of tuples
                ``(filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            fc_layer_params (tuple[int]): sizes of the FC layers.
            activation (nn.functional): activation of all layers except the
                last one.
            kernel_initializer (Callable): initializer of all layers except
                the last one. If None, ``variance_scaling_init`` with
                ``mode='fan_in'`` and a truncated normal distribution is used.
            use_fc_bn (bool): whether the FC layers use Batch Normalization.
            last_layer_size (int): optional size of an additional layer
                appended at the very end. It must be given together with
                ``last_activation``.
            last_activation (nn.functional): activation of the additional
                layer. It must be given together with ``last_layer_size``.
            last_kernel_initializer (Callable): initializer of the additional
                layer. If None, ``kernel_initializer`` is used (and a warning
                is emitted). Unused when there is no additional layer.
            last_use_fc_bn (bool): whether the additional layer uses Batch
                Normalization.
            name (str): name of this network.
        """
        if kernel_initializer is None:
            kernel_initializer = functools.partial(
                variance_scaling_init,
                mode='fan_in',
                distribution='truncated_normal',
                nonlinearity=activation)

        stages = []
        current_spec = input_tensor_spec

        # Stage 1: per-input preprocessors.
        if input_preprocessors:

            def _identity_if_missing(preprocessor):
                if preprocessor is None:
                    return alf.layers.Identity()
                return preprocessor

            input_preprocessors = alf.nest.map_structure(
                _identity_if_missing, input_preprocessors)
            parallel_preproc = alf.nn.Parallel(input_preprocessors,
                                               input_tensor_spec)
            stages.append(parallel_preproc)
            current_spec = parallel_preproc.output_spec

        # Stage 2: merge a nested input into one tensor.
        if not alf.nest.is_nested(current_spec):
            assert isinstance(current_spec, TensorSpec), \
                "The spec must be an instance of TensorSpec!"
        else:
            assert preprocessing_combiner is not None, \
                ("When a nested input tensor spec is provided, an input " +
                 "preprocessing combiner must also be provided!")
            current_spec = preprocessing_combiner(current_spec)
            stages.append(preprocessing_combiner)

        # Stage 3: conv layers, flattened into a vector.
        if conv_layer_params:
            assert isinstance(conv_layer_params, tuple), \
                "The input params {} should be tuple".format(conv_layer_params)
            assert len(current_spec.shape) == 3, \
                "The input shape {} should be like (C,H,W)!".format(
                    current_spec.shape)
            channels, height, width = current_spec.shape
            conv_encoder = ImageEncodingNetwork(
                channels, (height, width),
                conv_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer,
                flatten_output=True)
            stages.append(conv_encoder)
            current_spec = conv_encoder.output_spec

        assert current_spec.ndim == 1, \
            "The input shape {} should be like (N,)!".format(
                current_spec.shape)
        feature_size = current_spec.shape[0]

        # Stage 4: hidden FC layers.
        if fc_layer_params is not None:
            assert isinstance(fc_layer_params, tuple)
            hidden_sizes = list(fc_layer_params)
        else:
            hidden_sizes = []

        for units in hidden_sizes:
            stages.append(
                layers.FC(
                    feature_size,
                    units,
                    activation=activation,
                    use_bn=use_fc_bn,
                    kernel_initializer=kernel_initializer))
            feature_size = units

        # Stage 5: the optional, separately configured last layer.
        if not (last_layer_size is None and last_activation is None):
            assert last_layer_size is not None and last_activation is not None, \
                "Both last_layer_size and last_activation need to be specified!"
            if last_kernel_initializer is None:
                common.warning_once(
                    "last_kernel_initializer is not specified "
                    "for the last layer of size {}.".format(last_layer_size))
                last_kernel_initializer = kernel_initializer
            last_layer = layers.FC(
                feature_size,
                last_layer_size,
                activation=last_activation,
                use_bn=last_use_fc_bn,
                kernel_initializer=last_kernel_initializer)
            stages.append(last_layer)
            feature_size = last_layer_size

        # Stage 6: optional reshape of the output.
        if output_tensor_spec is not None:
            assert output_tensor_spec.numel == feature_size, (
                "network output "
                "size {a} is inconsisent with specified out_tensor_spec "
                "of size {b}".format(
                    a=feature_size, b=output_tensor_spec.numel))
            stages.append(alf.layers.Reshape(output_tensor_spec.shape))

        super().__init__(
            stages, input_tensor_spec=input_tensor_spec, name=name)

    def make_parallel(self, n: int, allow_non_parallel_input=False):
        """Make a parallelized version of this network.

        A parallel network has ``n`` copies of the network with the same
        structure but independently initialized parameters. It processes a
        batch of data with shape [batch_size, n, ...] using the ``n`` copies.

        TODO: remove ``allow_non_parallel_input``, i.e. make the parallel
        network reject non-parallel input. It will make the logic more
        transparent.

        Args:
            n (int): the number of copies
            allow_non_parallel_input (bool): if True, the returned network
                also accepts non-parallel input with shape [batch_size, ...].
                In this case the network checks whether the input is parallel
                and, if not, replicates it ``n`` times at the beginning.

        Returns:
            the parallelized network.
        """
        pnet = super().make_parallel(n)
        if not allow_non_parallel_input:
            return pnet
        return _ReplicateInputForParallel(
            self.input_tensor_spec, n, pnet, name=pnet.name)
class _ReplicateInputForParallel(Network):
    """Wrapper that lets a parallel network take non-parallel input.

    An input whose outer rank (relative to ``input_tensor_spec``) is 1 is
    first converted with ``alf.layers.make_parallel_input`` using ``n``;
    any other input is handed to the wrapped network unchanged.
    """

    def __init__(self, input_tensor_spec, n, pnet, name):
        """
        Args:
            input_tensor_spec (nested TensorSpec): spec of the non-parallel
                input; used to compute the outer rank of incoming inputs.
            n (int): the number of replicas of the parallel network.
            pnet (Network): the parallel network to wrap. Its ``state_spec``
                becomes the state spec of this wrapper.
            name (str): name of this network.
        """
        super().__init__(
            input_tensor_spec, state_spec=pnet.state_spec, name=name)
        self._input_tensor_spec = input_tensor_spec
        self._pnet = pnet
        self._n = n

    def forward(self, inputs, state=()):
        """Run the wrapped network, replicating ``inputs`` first if needed.

        Args:
            inputs (nested Tensor): parallel or non-parallel input.
            state (nested Tensor): state passed through to the wrapped
                network.

        Returns:
            whatever the wrapped parallel network returns.
        """
        if get_outer_rank(inputs, self._input_tensor_spec) == 1:
            # Only one outer dimension: treat as non-parallel input.
            inputs = alf.layers.make_parallel_input(inputs, self._n)
        return self._pnet(inputs, state)
@alf.configurable
def ParallelEncodingNetwork(input_tensor_spec,
                            n,
                            output_tensor_spec=None,
                            input_preprocessors=None,
                            preprocessing_combiner=None,
                            conv_layer_params=None,
                            fc_layer_params=None,
                            activation=torch.relu_,
                            kernel_initializer=None,
                            use_fc_bn=False,
                            last_layer_size=None,
                            last_activation=None,
                            last_kernel_initializer=None,
                            last_use_fc_bn=False,
                            name="ParallelEncodingNetwork"):
    """Parallel encoding network which effectively runs ``n`` individual
    encoding networks simultaneously.

    An ``EncodingNetwork`` is built from the given arguments and then
    parallelized with ``make_parallel(n, allow_non_parallel_input=True)``.

    Args:
        input_tensor_spec (nested TensorSpec): the (nested) tensor spec of
            the input. If nested, then ``preprocessing_combiner`` must not be
            None.
        n (int): number of parallel networks.
        output_tensor_spec (None|TensorSpec): spec for the output, excluding
            the dimension of parallel networks ``n``. If None, the output
            tensor spec will be assumed as
            ``TensorSpec((n, output_size, ))``, where ``output_size`` is
            inferred from network output. Otherwise, the output tensor spec
            will be ``TensorSpec((n, *output_tensor_spec.shape))`` and the
            network output will be reshaped accordingly. Note that
            ``output_tensor_spec`` is only used for reshaping the network
            outputs for interpretation purpose and is not used for
            specifying any network layers.
        input_preprocessors (None): must be ``None``. The value is forwarded
            to ``EncodingNetwork`` as is.
        preprocessing_combiner (NestCombiner): preprocessing called on
            complex inputs. Note that this combiner must also accept
            ``input_tensor_spec`` as the input to compute the processed
            tensor spec. For example, see ``alf.nest.utils.NestConcat``. This
            arg is helpful if you want to combine inputs by configuring a
            gin file without changing the code.
        conv_layer_params (tuple[tuple]): a tuple of tuples where each
            tuple takes a format ``(filters, kernel_size, strides, padding)``,
            where ``padding`` is optional.
        fc_layer_params (tuple[int]): a tuple of integers representing FC
            layer sizes.
        activation (nn.functional): activation used for all the layers but
            the last layer.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If None, a variance_scaling_initializer will be
            used.
        use_fc_bn (bool): whether to use Batch Normalization for fc layers.
        last_layer_size (int): an optional size of an additional layer
            appended at the very end. Note that if ``last_activation`` is
            specified, ``last_layer_size`` has to be specified explicitly.
        last_activation (nn.functional): activation function of the
            additional layer specified by ``last_layer_size``. Note that if
            ``last_layer_size`` is not None, ``last_activation`` has to be
            specified explicitly.
        last_kernel_initializer (Callable): initializer for the additional
            layer specified by ``last_layer_size``. If None, it will be the
            same with ``kernel_initializer``. If ``last_layer_size`` is None,
            ``last_kernel_initializer`` will not be used.
        last_use_fc_bn (bool): whether to use Batch Normalization for the
            last fc layer.
        name (str): name of the network.

    Returns:
        the parallelized network
    """
    single_net = EncodingNetwork(
        input_tensor_spec=input_tensor_spec,
        output_tensor_spec=output_tensor_spec,
        input_preprocessors=input_preprocessors,
        preprocessing_combiner=preprocessing_combiner,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
        activation=activation,
        kernel_initializer=kernel_initializer,
        use_fc_bn=use_fc_bn,
        last_layer_size=last_layer_size,
        last_activation=last_activation,
        last_kernel_initializer=last_kernel_initializer,
        last_use_fc_bn=last_use_fc_bn,
        name=name)
    # Always accept non-parallel input as well.
    return single_net.make_parallel(n, allow_non_parallel_input=True)
@alf.configurable
class LSTMEncodingNetwork(_Sequential):
    """LSTM cells followed by an encoding network."""

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec=None,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 pre_fc_layer_params=None,
                 hidden_size=(100, ),
                 lstm_output_layers=-1,
                 post_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 last_layer_size=None,
                 last_activation=None,
                 last_kernel_initializer=None,
                 name="LSTMEncodingNetwork"):
        """
        Args:
            input_tensor_spec (nested TensorSpec): the (nested) tensor spec of
                the input. If nested, then ``preprocessing_combiner`` must not
                be None.
            output_tensor_spec (None|TensorSpec): spec for the output. If None,
                the output tensor spec will be assumed as
                ``TensorSpec((output_size, ))``, where ``output_size`` is
                inferred from network output. Otherwise, the output tensor
                spec will be ``output_tensor_spec`` and the network output
                will be reshaped according to ``output_tensor_spec``.
                Note that ``output_tensor_spec`` is only used for reshaping
                the network outputs for interpretation purpose and is not used
                for specifying any network layers.
            input_preprocessors (nested Network|nn.Module|None): a nest of
                input preprocessors, each of which will be applied to the
                corresponding input. If not None, then it must have the same
                structure with ``input_tensor_spec``. This arg is helpful if
                you want to have separate preprocessings for different inputs
                by configuring a gin file without changing the code. For
                example, embedding a discrete input before concatenating it
                to another continuous vector.
            preprocessing_combiner (NestCombiner): preprocessing called on
                complex inputs. Note that this combiner must also accept
                ``input_tensor_spec`` as the input to compute the processed
                tensor spec. For example, see ``alf.nest.utils.NestConcat``.
                This arg is helpful if you want to combine inputs by
                configuring a gin file without changing the code.
            conv_layer_params (tuple[tuple]): a tuple of tuples where each
                tuple takes a format
                ``(filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            pre_fc_layer_params (tuple[int]): a tuple of integers representing
                FC layers that are applied before the LSTM cells.
            hidden_size (int or tuple[int]): the hidden size(s) of the lstm
                cell(s). Each size corresponds to a cell. If there are
                multiple sizes, then lstm cells are stacked.
            lstm_output_layers (None|int|list[int]): index or indices of the
                lstm layers whose outputs are used. Negative indices count
                from the last layer, so -1 means the output from the last
                lstm layer. ``None`` means all lstm layers. Unless only the
                last layer is selected, the selected outputs are combined
                with ``alf.layers.NestConcat``.
            post_fc_layer_params (tuple[int]): an optional tuple of integers
                representing hidden FC layers that are applied after the LSTM
                cells.
            activation (nn.functional): activation for all the layers but the
                last layer.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer.
            last_layer_size (int): an optional size of an additional layer
                appended at the very end. Note that if ``last_activation`` is
                specified, ``last_layer_size`` has to be specified explicitly.
            last_activation (nn.functional): activation function of the
                additional layer specified by ``last_layer_size``. Note that
                if ``last_layer_size`` is not None, ``last_activation`` has to
                be specified explicitly.
            last_kernel_initializer (Callable): initializer for the
                additional layer specified by ``last_layer_size``. If None, it
                will be the same with ``kernel_initializer``. If
                ``last_layer_size`` is None, ``last_kernel_initializer`` will
                not be used.
            name (str): name of this network.
        """
        nets = []

        # Optional encoder in front of the LSTM cells.
        if (input_preprocessors or preprocessing_combiner or conv_layer_params
                or pre_fc_layer_params):
            net = EncodingNetwork(
                input_tensor_spec=input_tensor_spec,
                input_preprocessors=input_preprocessors,
                preprocessing_combiner=preprocessing_combiner,
                conv_layer_params=conv_layer_params,
                fc_layer_params=pre_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer)
            input_size = net.output_spec.shape[0]
            nets.append(net)
        else:
            input_size = input_tensor_spec.shape[0]

        if isinstance(hidden_size, int):
            hidden_size = [hidden_size]
        else:
            assert isinstance(hidden_size, tuple)

        # Stack the cells: each cell consumes the hidden output of the
        # previous one.
        cells = []
        for hs in hidden_size:
            cells.append(
                alf.nn.LSTMCell(input_size=input_size, hidden_size=hs))
            input_size = hs

        # Normalize ``lstm_output_layers`` to a list of non-negative indices.
        # After this point it is always a list, so no further int check is
        # needed below.
        if lstm_output_layers is None:
            lstm_output_layers = list(range(len(hidden_size)))
        elif isinstance(lstm_output_layers, int):
            lstm_output_layers = [lstm_output_layers]
        lstm_output_layers = [
            len(cells) + i if i < 0 else i for i in lstm_output_layers
        ]

        if lstm_output_layers == [len(cells) - 1]:
            # Only the last layer is needed: the cells can be chained
            # directly and ``input_size`` is already the last hidden size.
            nets.extend(cells)
        else:
            lstms = {'lstm%s' % i: cell for i, cell in enumerate(cells)}
            lstms['o'] = (
                tuple('lstm%s' % i
                      for i in lstm_output_layers),  # the inputs for NestConcat
                alf.layers.NestConcat())
            nets.append(alf.nn.Sequential(**lstms, name='lstm_block'))
            input_size = sum(hidden_size[i] for i in lstm_output_layers)

        # Optional encoder after the LSTM cells.
        if post_fc_layer_params is not None or last_layer_size is not None:
            net = EncodingNetwork(
                input_tensor_spec=TensorSpec((input_size, )),
                fc_layer_params=post_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer,
                last_layer_size=last_layer_size,
                last_activation=last_activation,
                last_kernel_initializer=last_kernel_initializer)
            nets.append(net)
            input_size = net.output_spec.numel

        if output_tensor_spec is not None:
            assert output_tensor_spec.numel == input_size, (
                "network output "
                "size {a} is inconsisent with specified out_tensor_spec "
                "of size {b}".format(a=input_size, b=output_tensor_spec.numel))
            nets.append(alf.layers.Reshape(output_tensor_spec.shape))

        super().__init__(nets, input_tensor_spec=input_tensor_spec, name=name)

    def make_parallel(self, n: int, allow_non_parallel_input=False):
        """Make a parallelized version of this network.

        A parallel network has ``n`` copies of network with the same structure
        but different independently initialized parameters. The parallel
        network can process a batch of the data with shape
        [batch_size, n, ...] using ``n`` networks with same structure.

        Args:
            n (int): the number of copies
            allow_non_parallel_input (bool): if True, the returned network
                will also accept non-parallel input with shape
                [batch_size, ...]. In this case, the network will check
                whether the input is parallel input. If not, the input will
                be automatically replicated ``n`` times at the beginning.

        Returns:
            the parallelized network.
        """
        pnet = super().make_parallel(n)
        if allow_non_parallel_input:
            return _ReplicateInputForParallel(
                self.input_tensor_spec, n, pnet, name=pnet.name)
        else:
            return pnet