# Copyright (c) 2020 Horizon Robotics and ALF Contributors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
import numpy as np
from typing import Callable, Optional, Tuple, Union
import torch
import torch.nn as nn
from .containers import _Sequential
from .network import Network
import alf
import alf.layers as layers
from alf.initializers import variance_scaling_init
from alf.tensor_specs import TensorSpec
from alf.utils import common
from alf.nest.utils import get_outer_rank
@alf.configurable
class ImageEncodingNetwork(_Sequential):
    """Convolutional encoder assembled from a stack of ``Conv2D`` layers."""

    def __init__(self,
                 input_channels,
                 input_size,
                 conv_layer_params,
                 same_padding=False,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 flatten_output=False,
                 name="ImageEncodingNetwork"):
        """Create the conv layers that encode an image.

        Nested inputs are not handled by this class.

        Each conv layer changes the spatial size according to
        `<https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html>`_::

            H = (H1 - HF + 2P) // strides + 1

        where H = output size, H1 = input size, HF = kernel size and
        P = padding.

        Padding modes (named after the TF conventions): in ``valid`` mode no
        padding is added (P=0). In ``same`` mode (also called "half padding")
        P=(HF-1)//2, so with strides=1 and an odd HF the output has the same
        size as the input. The same P is applied on both sides, hence with
        strides=1 and an even HF the output is one pixel smaller than the
        input.

        Args:
            input_channels (int): number of channels in the input image
            input_size (int or tuple): the input image size (height, width)
            conv_layer_params (tuple[tuple]): a non-empty tuple of tuples
                ``(num_filters, kernel_size, strides, padding)``, where
                ``padding`` is optional (defaults to 0).
            same_padding (bool): if True, any padding given in
                ``conv_layer_params`` is replaced by ``(kernel_size-1)//2``
                per dimension (TF's ``same`` mode); if False, the user
                provided padding (or 0) is used (TF's ``valid`` mode when no
                padding is given).
            activation (torch.nn.functional): activation for all the layers
            kernel_initializer (Callable): initializer for all the layers.
            flatten_output (bool): if False, the output keeps the image
                structure ``BxCxHxW``; otherwise a ``Reshape((-1, ))`` layer
                is appended so the output is a feature of shape ``BxN``.
            name (str): name of this network.
        """
        image_hw = common.tuplify2d(input_size)
        input_tensor_spec = TensorSpec((input_channels, ) + image_hw)

        assert isinstance(conv_layer_params, tuple)
        assert len(conv_layer_params) > 0

        conv_stack = []
        in_channels = input_channels
        for layer_spec in conv_layer_params:
            out_channels, kernel_size, strides = layer_spec[:3]
            if same_padding:
                # "same" mode ignores any user supplied padding.
                kernel_size = common.tuplify2d(kernel_size)
                padding = ((kernel_size[0] - 1) // 2,
                           (kernel_size[1] - 1) // 2)
            elif len(layer_spec) > 3:
                padding = layer_spec[3]
            else:
                padding = 0
            conv_stack.append(
                layers.Conv2D(
                    in_channels,
                    out_channels,
                    kernel_size,
                    activation=activation,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding))
            # The next layer consumes what this one produces.
            in_channels = out_channels

        if flatten_output:
            conv_stack.append(alf.layers.Reshape((-1, )))

        super().__init__(
            conv_stack, input_tensor_spec=input_tensor_spec, name=name)
@alf.configurable
class ImageDecodingNetwork(_Sequential):
    """Transposed-convolution decoder that maps a latent vector to an image."""

    def __init__(self,
                 input_size,
                 transconv_layer_params,
                 start_decoding_size,
                 start_decoding_channels,
                 same_padding=False,
                 preprocess_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 output_activation=torch.tanh,
                 name="ImageDecodingNetwork"):
        """Create the layers that decode a latent vector into an image.

        Nested inputs are not handled by this class.

        Each transposed conv layer changes the spatial size according to
        `<https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html>`_::

            H = (H1-1) * strides + HF - 2P + OP

        where H = output size, H1 = input size, HF = kernel size, P = padding
        and OP = output_padding (this class never passes ``output_padding``,
        i.e. it is left at the layer's default).

        Padding modes (named after the TF conventions): in ``valid`` mode no
        padding is used (P=0). In ``same`` mode (also called "half padding")
        P=(HF-1)//2, so with strides=1 and an odd HF the output has the same
        size as the input. The same P is applied on both sides, hence with
        strides=1 and an even HF the output is one pixel larger than the
        input.

        Args:
            input_size (int): the size of the input latent vector
            transconv_layer_params (tuple[tuple]): a non-empty tuple of
                tuples ``(num_filters, kernel_size, strides, padding)``,
                where ``padding`` is optional (defaults to 0).
            start_decoding_size (int or tuple): the initial height and width
                of the feature map.
            start_decoding_channels (int): the initial number of channels of
                the feature map. The latent vector is always first projected
                by an FC layer to a vector that can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``).
            same_padding (bool): if True, any padding given in
                ``transconv_layer_params`` is replaced by
                ``(kernel_size-1)//2`` per dimension (TF's ``same`` mode); if
                False, the user provided padding (or 0) is used.
            preprocess_fc_layer_params (tuple[int]): sizes of the FC layers
                applied to the latent vector before the projection and the
                transposed convolutions.
            activation (nn.functional): activation for hidden layers
            kernel_initializer (Callable): initializer for all the layers.
            output_activation (nn.functional): activation for the last
                transposed conv layer. Image targets are usually normalized
                to [0, 1] or [-1, 1], so this is typically ``torch.sigmoid``
                or ``torch.tanh``.
            name (str): name of this network.
        """
        input_tensor_spec = TensorSpec((input_size, ))

        assert isinstance(transconv_layer_params, tuple)
        assert len(transconv_layer_params) > 0

        # PyTorch is "channels_first": (C, H, W).
        start_hw = common.tuplify2d(start_decoding_size)
        start_decoding_shape = [
            start_decoding_channels, start_hw[0], start_hw[1]
        ]

        stages = []
        fc_in = input_size
        if preprocess_fc_layer_params is not None:
            for fc_out in preprocess_fc_layer_params:
                stages.append(
                    layers.FC(
                        fc_in,
                        fc_out,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                fc_in = fc_out

        # Project to a flat vector that is then viewed as the start feature
        # map.
        stages.append(
            layers.FC(
                fc_in,
                np.prod(start_decoding_shape),
                activation=activation,
                kernel_initializer=kernel_initializer))
        stages.append(alf.layers.Reshape(start_decoding_shape))

        last_index = len(transconv_layer_params) - 1
        in_channels = start_decoding_channels
        for index, layer_spec in enumerate(transconv_layer_params):
            out_channels, kernel_size, strides = layer_spec[:3]
            if same_padding:
                # "same" mode ignores any user supplied padding.
                kernel_size = common.tuplify2d(kernel_size)
                padding = ((kernel_size[0] - 1) // 2,
                           (kernel_size[1] - 1) // 2)
            elif len(layer_spec) > 3:
                padding = layer_spec[3]
            else:
                padding = 0
            # Only the final layer uses the output activation.
            layer_act = (output_activation
                         if index == last_index else activation)
            stages.append(
                layers.ConvTranspose2D(
                    in_channels,
                    out_channels,
                    kernel_size,
                    activation=layer_act,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding))
            in_channels = out_channels

        super().__init__(
            stages, input_tensor_spec=input_tensor_spec, name=name)
@alf.configurable
class ImageDecodingNetworkV2(_Sequential):
    """Image decoding using upsampling+convolution.

    Unlike ``ImageDecodingNetwork``, which relies on transposed convolutions
    to grow a small input into a larger image, this class interleaves
    upsampling layers with ordinary conv layers, letting the convs refine the
    upsampled (e.g., nearest neighbor, bilinear, etc) results.

    The difference between the two approaches is discussed in
    `<https://distill.pub/2016/deconv-checkerboard/>`_. In short,
    upsampling+conv might help reduce the checkerboard artifacts that are
    common in the outputs of transposed convolutions.
    """

    def __init__(self,
                 input_size: int,
                 upsample_conv_layer_params: Tuple[Union[int, Tuple[int]]],
                 start_decoding_size: Union[int, Tuple[int]],
                 start_decoding_channels: int,
                 preprocess_fc_layer_params: Tuple[int] = None,
                 upsampling_mode: str = 'nearest',
                 same_padding: bool = False,
                 activation: Callable = torch.relu_,
                 kernel_initializer: Callable = None,
                 output_activation: Callable = torch.tanh,
                 name: str = "ImageDecodingNetworkV2"):
        """An example network of upsampling+conv for decoding images.

        .. code-block:: python

            net = ImageDecodingNetworkV2(input_size=100,
                                         start_decoding_size=10,
                                         start_decoding_channels=8,
                                         same_padding=True,
                                         upsample_conv_layer_params=(
                                            2,
                                            (16, 3, 1),
                                            (32, 3, 1),
                                            2,
                                            (64, 3, 1),
                                            (3, 3, 1)))
            # The image shape: (8,10,10) -> (8,20,20) -> (16,20,20) -> (32,20,20)
            # -> (32,40,40) -> (64,40,40) -> (3,40,40)

        Args:
            input_size: the size of the input latent vector
            upsample_conv_layer_params: a non-empty tuple of ints or tuples.
                An int is the scaling factor of a ``torch.nn.Upsample``
                layer; any other element should be a tuple of conv params
                ``(num_filters, kernel_size, strides, padding)``, where
                ``padding`` is optional (defaults to 0).
            start_decoding_size: the initial height and width of the feature
                map.
            start_decoding_channels: the initial number of channels of the
                feature map. The latent vector is always first projected by
                an FC layer to a vector that can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``).
            preprocess_fc_layer_params: if not None, the input is first fed
                through FC layers of these sizes, before the projection to
                the start feature map.
            upsampling_mode: the ``mode`` argument given to every
                ``torch.nn.Upsample`` layer.
            same_padding: if True, any padding given in
                ``upsample_conv_layer_params`` is replaced by
                ``(kernel_size-1)//2`` per dimension (TF's ``same`` mode); if
                False, the user provided padding (or 0) is used. Please refer
                to the docstring of ``ImageEncodingNetwork`` for the
                definitions of the two padding modes.
            activation: activation for hidden layers
            kernel_initializer: initializer for all the layers.
            output_activation: activation for the output layer. Image targets
                are usually normalized to [0, 1] or [-1, 1], so this is
                typically ``torch.sigmoid`` or ``torch.tanh``. Note that it
                is only used when the last element of
                ``upsample_conv_layer_params`` is a conv spec; if the last
                element is an upsampling factor, every conv layer uses
                ``activation``.
            name (str): name of this network.
        """
        input_tensor_spec = TensorSpec((input_size, ))

        assert isinstance(upsample_conv_layer_params, tuple)
        assert len(upsample_conv_layer_params) > 0

        # PyTorch is "channels_first": (C, H, W).
        start_hw = common.tuplify2d(start_decoding_size)
        start_decoding_shape = [
            start_decoding_channels, start_hw[0], start_hw[1]
        ]

        stages = []
        fc_in = input_size
        if preprocess_fc_layer_params is not None:
            for fc_out in preprocess_fc_layer_params:
                stages.append(
                    layers.FC(
                        fc_in,
                        fc_out,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                fc_in = fc_out

        # Project to a flat vector and view it as the start feature map.
        stages.append(
            layers.FC(
                fc_in,
                np.prod(start_decoding_shape),
                activation=activation,
                kernel_initializer=kernel_initializer))
        stages.append(alf.layers.Reshape(start_decoding_shape))

        last_index = len(upsample_conv_layer_params) - 1
        in_channels = start_decoding_channels
        for index, entry in enumerate(upsample_conv_layer_params):
            if isinstance(entry, int):
                # A bare int is an upsampling factor; channels are unchanged.
                stages.append(
                    torch.nn.Upsample(
                        scale_factor=entry, mode=upsampling_mode))
                continue

            out_channels, kernel_size, strides = entry[:3]
            if same_padding:
                # "same" mode ignores any user supplied padding.
                kernel_size = common.tuplify2d(kernel_size)
                padding = ((kernel_size[0] - 1) // 2,
                           (kernel_size[1] - 1) // 2)
            elif len(entry) > 3:
                padding = entry[3]
            else:
                padding = 0
            layer_act = (output_activation
                         if index == last_index else activation)
            stages.append(
                layers.Conv2D(
                    in_channels,
                    out_channels,
                    kernel_size,
                    activation=layer_act,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding))
            in_channels = out_channels

        super().__init__(
            stages, input_tensor_spec=input_tensor_spec, name=name)
def SpatialBroadcastDecodingNetwork(
        input_size: int,
        output_height: int,
        conv_layer_params: Tuple[Tuple[int]],
        output_width: int = None,
        fc_layer_params: Tuple[int] = None,
        activation: Callable = torch.relu_,
        output_activation: Callable = alf.utils.math_ops.identity,
        name: str = "SpatialBroadcastDecodingNetwork"):
    """Implements the spatial broadcast decoder in
    `Watters et al. 2019,
    Spatial Broadcast Decoder: A Simple Architecture for Learning Disentangled
    Representations in VAEs <https://arxiv.org/abs/1901.07017>`_.

    In short, given a latent embedding and a target output height/width, this
    decoder first spatially broadcasts the embedding over ``height*width``,
    appends a uniform ``xy`` meshgrid in [-1,1], and applies conv layers
    (all built with ``same_padding=True``).

    Args:
        input_size: the latent embedding size
        output_height: the target output image height
        conv_layer_params: a non-empty tuple of conv layer params applied
            after broadcasting. All layers but the last use ``activation``;
            the last one uses ``output_activation``. A single-element tuple
            is allowed, in which case only the output conv layer is created.
        output_width: if None, it's equal to ``output_height``
        fc_layer_params: a tuple of fc layer sizes applied to the input
            embedding before broadcasting
        activation: activation of the fc layers and the intermediate conv
            layers
        output_activation: the activation of the last conv layer
        name: name of the returned network

    Returns:
        an ``alf.nn.Sequential`` chaining the preprocessing (projection,
        broadcast, coordinate append) and the conv layers.
    """
    # Fail fast on invalid conv params, before any network is constructed.
    assert isinstance(conv_layer_params, tuple) and len(conv_layer_params) > 0

    input_tensor_spec = TensorSpec((input_size, ))

    proj = alf.math.identity
    if fc_layer_params is not None:
        proj = EncodingNetwork(
            input_tensor_spec=input_tensor_spec,
            fc_layer_params=fc_layer_params,
            activation=activation)

    if output_width is None:
        output_width = output_height

    preproc_net = alf.nn.Sequential(
        proj,
        functools.partial(
            alf.utils.tensor_utils.spatial_broadcast,
            im_shape=(output_height, output_width)),
        alf.utils.tensor_utils.append_coordinate,
        input_tensor_spec=input_tensor_spec)

    stages = [preproc_net]
    feature_spec = preproc_net.output_spec

    # ``ImageEncodingNetwork`` rejects an empty ``conv_layer_params``, so the
    # hidden conv stack is only created when there is at least one hidden
    # layer, i.e. when more than one conv layer was requested.
    hidden_conv_params = conv_layer_params[:-1]
    if hidden_conv_params:
        conv_net = ImageEncodingNetwork(
            input_channels=feature_spec.shape[0],
            input_size=feature_spec.shape[1:],
            conv_layer_params=hidden_conv_params,
            same_padding=True,
            activation=activation)
        stages.append(conv_net)
        feature_spec = conv_net.output_spec

    # The last conv layer is built separately because it uses a different
    # activation.
    last_conv_net = ImageEncodingNetwork(
        input_channels=feature_spec.shape[0],
        input_size=feature_spec.shape[1:],
        conv_layer_params=conv_layer_params[-1:],
        same_padding=True,
        activation=output_activation)
    stages.append(last_conv_net)

    return alf.nn.Sequential(*stages, name=name)
@alf.configurable
class AutoShapeImageDeconvNetwork(_Sequential):
    """
    A general template class for creating image deconv (transposed
    convolutional) networks with auto-shape inference (thus named as
    ``AutoShapeImageDeconvNetwork``).
    """

    def __init__(self,
                 input_size: int,
                 transconv_layer_params: Tuple,
                 output_shape: Tuple,
                 start_decoding_channels: int,
                 preprocess_fc_layer_params: Optional[Tuple] = None,
                 activation: Optional[Callable] = torch.relu_,
                 kernel_initializer: Optional[Callable] = None,
                 output_activation: Optional[Callable] = torch.tanh,
                 name="AutoShapeImageDeconvNetwork"):
        """
        Auto-shape inference: instead of specifying an initial start shape
        for the image deconv, only the desired output shape of the image is
        specified. The spatial shape to start decoding from is derived from
        ``output_shape`` and ``transconv_layer_params`` by walking the layers
        backwards, and an FC layer maps the latent vector to that start shape.
        The ``output_padding`` of every transposed conv layer is chosen so
        that the final output matches ``output_shape`` exactly.

        Args:
            input_size (int): the size of the input latent vector
            transconv_layer_params (tuple[tuple]): a non-empty tuple of
                tuples ``(num_filters, kernel_size, strides, padding)``,
                where ``padding`` is optional (defaults to 0). The
                ``num_filters`` of the last layer must equal
                ``output_shape[0]``.
            output_shape (tuple): the complete output shape ``(c, h, w)``.
            start_decoding_channels (int): the initial number of channels of
                the feature map. The latent vector is always first projected
                by an FC layer to a vector that can be reshaped into
                (``start_decoding_channels``, ``start_decoding_height``,
                ``start_decoding_width``), where ``start_decoding_height``
                and ``start_decoding_width`` are inferred from
                ``output_shape`` and ``transconv_layer_params``.
            preprocess_fc_layer_params (tuple[int]): sizes of the FC layers
                applied to the latent vector before the projection and the
                transposed convolutions.
            activation (nn.functional): activation for hidden layers
            kernel_initializer (Callable): initializer for all the layers.
            output_activation (nn.functional): activation for the last
                transposed conv layer. Image targets are usually normalized
                to [0, 1] or [-1, 1], so this is typically ``torch.sigmoid``
                or ``torch.tanh``.
            name (str): name of this network.
        """
        # Validate the layer params *before* indexing into them, so that an
        # empty or non-tuple argument fails on these assertions instead of on
        # an unrelated IndexError/TypeError.
        assert isinstance(transconv_layer_params, tuple)
        assert len(transconv_layer_params) > 0
        assert len(output_shape) == 3, "the output_shape should be (c, h, w)"
        assert output_shape[0] == transconv_layer_params[-1][0], (
            "channel number mis-match")

        # Walk the layers from the output back to the input. For each layer,
        # the size of its input is what a conv with the same params would
        # produce from its output; the output padding absorbs the remainder
        # lost to the floor division.
        out_paddings = []
        out_shape = tuple(output_shape[1:])
        for paras in reversed(transconv_layer_params):
            kernel_size = common.tuplify2d(paras[1])
            stride = paras[2]
            padding = common.tuplify2d(paras[3] if len(paras) > 3 else 0)
            conv_shape = self._calc_conv_out_shape(out_shape, padding,
                                                   kernel_size, stride)
            out_paddings.append(
                self._calc_output_padding_shape(out_shape, conv_shape,
                                                padding, kernel_size, stride))
            out_shape = conv_shape
        # Collected output-to-input; flip to match the forward layer order.
        out_paddings.reverse()
        start_size = out_shape

        input_tensor_spec = TensorSpec((input_size, ))

        nets = []
        if preprocess_fc_layer_params is not None:
            for size in preprocess_fc_layer_params:
                nets.append(
                    layers.FC(
                        input_size,
                        size,
                        activation=activation,
                        kernel_initializer=kernel_initializer))
                input_size = size

        # PyTorch is "channels_first": (C, H, W).
        start_decoding_shape = [
            start_decoding_channels, start_size[0], start_size[1]
        ]
        nets.append(
            layers.FC(
                input_size,
                np.prod(start_decoding_shape),
                activation=activation,
                kernel_initializer=kernel_initializer))
        nets.append(alf.layers.Reshape(start_decoding_shape))

        in_channels = start_decoding_channels
        last_index = len(transconv_layer_params) - 1
        for i, (paras, output_padding) in enumerate(
                zip(transconv_layer_params, out_paddings)):
            filters, kernel_size, strides = paras[:3]
            padding = paras[3] if len(paras) > 3 else 0
            # Only the final layer uses the output activation.
            act = output_activation if i == last_index else activation
            nets.append(
                layers.ConvTranspose2D(
                    in_channels,
                    filters,
                    kernel_size,
                    activation=act,
                    kernel_initializer=kernel_initializer,
                    strides=strides,
                    padding=padding,
                    output_padding=output_padding))
            in_channels = filters

        super().__init__(nets, input_tensor_spec=input_tensor_spec, name=name)

    def _calc_conv_out_shape(self, input_size, padding, kernel_size, stride):
        """Calculate the output shape of a conv2d operation.

        Reference:
        `<https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html>`_.

        Args:
            input_size (tuple[int]): input ``(height, width)``
            padding (tuple[int]): padding per spatial dimension
            kernel_size (tuple[int]): kernel size per spatial dimension
            stride (int|tuple[int]): a single stride shared by both
                dimensions, or one stride per dimension

        Returns:
            tuple[int]: the conv output ``(height, width)``, i.e.
            ``(x + 2p - k) // s + 1`` for each dimension.
        """
        strides = common.tuplify2d(stride)
        return tuple(
            int((x + 2 * p - k) // s + 1)
            for x, p, k, s in zip(input_size, padding, kernel_size, strides))

    def _calc_output_padding_shape(self, input_size, conv_out, padding,
                                   kernel_size, stride):
        """Calculate the output padding to be used for ``ConvTranspose2D``
        so that the image obtained from it has a size that matches
        ``input_size``.

        Args:
            input_size (tuple[int]): the size the transposed conv must
                produce
            conv_out (tuple[int]): the size fed into the transposed conv (as
                given by ``_calc_conv_out_shape``)
            padding (tuple[int]): padding per spatial dimension
            kernel_size (tuple[int]): kernel size per spatial dimension
            stride (int|tuple[int]): a single stride shared by both
                dimensions, or one stride per dimension

        Returns:
            tuple: ``x - (c - 1) * s + 2p - k`` for each dimension, obtained
            by solving the ``ConvTranspose2d`` output size formula
            ``x = (c - 1) * s - 2p + k + output_padding``.
        """
        strides = common.tuplify2d(stride)
        return tuple(
            x - (c - 1) * s + 2 * p - k for x, c, p, k, s in zip(
                input_size, conv_out, padding, kernel_size, strides))
@alf.configurable
class EncodingNetwork(_Sequential):
    """Feed-forward network made of optional conv layers followed by FC
    layers, where the last layer may be configured differently from the
    others.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec=None,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 use_fc_bn=False,
                 last_layer_size=None,
                 last_activation=None,
                 last_kernel_initializer=None,
                 last_use_fc_bn=False,
                 name="EncodingNetwork"):
        """
        Args:
            input_tensor_spec (nested TensorSpec): the (nested) tensor spec
                of the input. If it is still nested after the input
                preprocessors are applied, ``preprocessing_combiner`` must
                not be None.
            output_tensor_spec (None|TensorSpec): spec for the output. If
                None, the output is a vector whose size is determined by the
                layers. Otherwise a ``Reshape`` layer to
                ``output_tensor_spec.shape`` is appended; its number of
                elements must match the size of the network output. It is
                only used for reshaping the output and does not influence
                any other layer.
            input_preprocessors (nested Network|nn.Module|None): a nest of
                preprocessors, each of which is applied to the corresponding
                input. If not None, it must have the same structure as
                ``input_tensor_spec``; ``None`` entries are replaced by an
                identity layer. This is helpful for configuring separate
                preprocessings for different inputs without changing the
                code, e.g. embedding a discrete input before concatenating
                it to a continuous vector.
            preprocessing_combiner (NestCombiner): combines a nested input
                into a single tensor. It is also called on the (nested)
                tensor spec to compute the combined spec. For example, see
                ``alf.nest.utils.NestConcat``.
            conv_layer_params (tuple[tuple]): a tuple of tuples where each
                tuple takes a format
                ``(filters, kernel_size, strides, padding)``, where
                ``padding`` is optional. If provided, the input at this
                point must have a shape like ``(C,H,W)`` and the conv output
                is flattened.
            fc_layer_params (tuple[int]): a tuple of integers representing
                FC layer sizes.
            activation (nn.functional): activation used for all the layers
                but the last layer.
            kernel_initializer (Callable): initializer for all the layers
                but the last layer. If None, ``variance_scaling_init`` with
                ``mode='fan_in'``, ``distribution='truncated_normal'`` and
                ``nonlinearity=activation`` is used.
            use_fc_bn (bool): whether to use Batch Normalization for the fc
                layers.
            last_layer_size (int): an optional size of an additional FC
                layer appended at the very end. ``last_layer_size`` and
                ``last_activation`` must be either both specified or both
                None.
            last_activation (nn.functional): activation function of the
                additional layer specified by ``last_layer_size``.
            last_kernel_initializer (Callable): initializer for the
                additional layer specified by ``last_layer_size``. If None,
                ``kernel_initializer`` is used (and a warning is emitted
                once). Not used if ``last_layer_size`` is None.
            last_use_fc_bn (bool): whether to use Batch Normalization for
                the last fc layer.
            name (str): name of this network.
        """
        if kernel_initializer is None:
            kernel_initializer = functools.partial(
                variance_scaling_init,
                mode='fan_in',
                distribution='truncated_normal',
                nonlinearity=activation)

        modules = []
        cur_spec = input_tensor_spec

        # 1. Optional per-input preprocessing.
        if input_preprocessors:

            def _or_identity(preprocessor):
                if preprocessor is None:
                    return alf.layers.Identity()
                return preprocessor

            input_preprocessors = alf.nest.map_structure(
                _or_identity, input_preprocessors)
            parallel_net = alf.nn.Parallel(input_preprocessors,
                                           input_tensor_spec)
            modules.append(parallel_net)
            cur_spec = parallel_net.output_spec

        # 2. Collapse a nested input into a single tensor.
        if not alf.nest.is_nested(cur_spec):
            assert isinstance(cur_spec, TensorSpec), \
                "The spec must be an instance of TensorSpec!"
        else:
            assert preprocessing_combiner is not None, \
                ("When a nested input tensor spec is provided, an input " +
                 "preprocessing combiner must also be provided!")
            cur_spec = preprocessing_combiner(cur_spec)
            modules.append(preprocessing_combiner)

        # 3. Optional conv layers; their output is flattened for the FCs.
        if conv_layer_params:
            assert isinstance(conv_layer_params, tuple), \
                "The input params {} should be tuple".format(conv_layer_params)
            assert len(cur_spec.shape) == 3, \
                "The input shape {} should be like (C,H,W)!".format(
                    cur_spec.shape)
            channels, height, width = cur_spec.shape
            conv_net = ImageEncodingNetwork(
                channels, (height, width),
                conv_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer,
                flatten_output=True)
            modules.append(conv_net)
            cur_spec = conv_net.output_spec

        assert cur_spec.ndim == 1, \
            "The input shape {} should be like (N,)!".format(cur_spec.shape)
        feature_size = cur_spec.shape[0]

        # 4. Hidden FC layers.
        if fc_layer_params is not None:
            assert isinstance(fc_layer_params, tuple)
        fc_sizes = [] if fc_layer_params is None else list(fc_layer_params)
        for fc_size in fc_sizes:
            modules.append(
                layers.FC(
                    feature_size,
                    fc_size,
                    activation=activation,
                    use_bn=use_fc_bn,
                    kernel_initializer=kernel_initializer))
            feature_size = fc_size

        # 5. Optional last layer with its own settings.
        if not (last_layer_size is None and last_activation is None):
            assert last_layer_size is not None and last_activation is not None, \
                "Both last_layer_size and last_activation need to be specified!"
            if last_kernel_initializer is None:
                common.warning_once(
                    "last_kernel_initializer is not specified "
                    "for the last layer of size {}.".format(last_layer_size))
                last_kernel_initializer = kernel_initializer
            modules.append(
                layers.FC(
                    feature_size,
                    last_layer_size,
                    activation=last_activation,
                    use_bn=last_use_fc_bn,
                    kernel_initializer=last_kernel_initializer))
            feature_size = last_layer_size

        # 6. Optional reshape of the flat output.
        if output_tensor_spec is not None:
            assert output_tensor_spec.numel == feature_size, (
                "network output "
                "size {a} is inconsisent with specified out_tensor_spec "
                "of size {b}".format(
                    a=feature_size, b=output_tensor_spec.numel))
            modules.append(alf.layers.Reshape(output_tensor_spec.shape))

        super().__init__(
            modules, input_tensor_spec=input_tensor_spec, name=name)

    def make_parallel(self, n: int, allow_non_parallel_input=False):
        """Make a parallelized version of this network.

        A parallel network has ``n`` copies of the network with the same
        structure but different, independently initialized parameters. It
        processes a batch of data with shape [batch_size, n, ...] using the
        ``n`` networks.

        TODO: remove ``allow_non_parallel_input``, i.e. make the parallel
        network not accept non-parallel input. It will make the logic more
        transparent.

        Args:
            n (int): the number of copies
            allow_non_parallel_input (bool): if True, the returned network
                will also accept non-parallel input with shape
                [batch_size, ...]. In this case, the network will check
                whether the input is parallel input. If not, the input will
                be automatically replicated ``n`` times at the beginning.

        Returns:
            the parallelized network.
        """
        pnet = super().make_parallel(n)
        if not allow_non_parallel_input:
            return pnet
        # Wrap so that non-parallel inputs are replicated before the call.
        return _ReplicateInputForParallel(
            self.input_tensor_spec, n, pnet, name=pnet.name)
class _ReplicateInputForParallel(Network):
    """Wrapper around a parallel network that also accepts non-parallel
    inputs, by replicating them ``n`` times before calling the wrapped
    network.
    """

    def __init__(self, input_tensor_spec, n, pnet, name):
        """
        Args:
            input_tensor_spec (nested TensorSpec): spec of the non-parallel
                input.
            n (int): number of replicas to create for a non-parallel input.
            pnet (Network): the wrapped parallel network; its ``state_spec``
                is reused as the state spec of this wrapper.
            name (str): name of this network.
        """
        super().__init__(
            input_tensor_spec, state_spec=pnet.state_spec, name=name)
        self._pnet = pnet
        self._n = n
        self._input_tensor_spec = input_tensor_spec

    def forward(self, inputs, state=()):
        """Run the wrapped network, replicating ``inputs`` first if needed.

        Args:
            inputs (nested Tensor): parallel or non-parallel input.
            state: state passed through to the wrapped network.

        Returns:
            whatever the wrapped parallel network returns.
        """
        # An outer rank of 1 w.r.t. the non-parallel spec is treated as a
        # non-parallel input, which has to be expanded for the ``n`` networks.
        if get_outer_rank(inputs, self._input_tensor_spec) == 1:
            inputs = alf.layers.make_parallel_input(inputs, self._n)
        return self._pnet(inputs, state)
@alf.configurable
def ParallelEncodingNetwork(input_tensor_spec,
                            n,
                            output_tensor_spec=None,
                            input_preprocessors=None,
                            preprocessing_combiner=None,
                            conv_layer_params=None,
                            fc_layer_params=None,
                            activation=torch.relu_,
                            kernel_initializer=None,
                            use_fc_bn=False,
                            last_layer_size=None,
                            last_activation=None,
                            last_kernel_initializer=None,
                            last_use_fc_bn=False,
                            name="ParallelEncodingNetwork"):
    """Parallel encoding network which effectively runs ``n`` individual
    encoding networks simultaneously.

    It is created by building a single ``EncodingNetwork`` from the given
    arguments and parallelizing it with ``make_parallel(n, True)``, so the
    result also accepts non-parallel inputs.

    Args:
        input_tensor_spec (nested TensorSpec): the (nested) tensor spec of
            the input. If nested, then ``preprocessing_combiner`` must not be
            None.
        n (int): number of parallel networks
        output_tensor_spec (None|TensorSpec): spec for the output, excluding
            the dimension of parallel networks ``n``. If None, the output
            tensor spec will be assumed as ``TensorSpec((n, output_size, ))``,
            where ``output_size`` is inferred from network output.
            Otherwise, the output tensor spec will be
            ``TensorSpec((n, *output_tensor_spec.shape))`` and the network
            output will be reshaped accordingly. Note that
            ``output_tensor_spec`` is only used for reshaping the network
            outputs and is not used for specifying any network layers.
        input_preprocessors (None): must be ``None`` (NOTE: this is not
            checked here; the value is forwarded to ``EncodingNetwork``
            unchanged).
        preprocessing_combiner (NestCombiner): preprocessing called on
            complex inputs. Note that this combiner must also accept
            ``input_tensor_spec`` as the input to compute the processed
            tensor spec. For example, see ``alf.nest.utils.NestConcat``.
        conv_layer_params (tuple[tuple]): a tuple of tuples where each
            tuple takes a format ``(filters, kernel_size, strides, padding)``,
            where ``padding`` is optional.
        fc_layer_params (tuple[int]): a tuple of integers representing FC
            layer sizes.
        activation (nn.functional): activation used for all the layers but
            the last layer.
        kernel_initializer (Callable): initializer for all the layers but
            the last layer. If None, a variance_scaling_initializer will be
            used.
        use_fc_bn (bool): whether to use Batch Normalization for fc layers.
        last_layer_size (int): an optional size of an additional layer
            appended at the very end. ``last_layer_size`` and
            ``last_activation`` have to be specified together.
        last_activation (nn.functional): activation function of the
            additional layer specified by ``last_layer_size``.
        last_kernel_initializer (Callable): initializer for the additional
            layer specified by ``last_layer_size``. If None, it will be the
            same as ``kernel_initializer``. If ``last_layer_size`` is None,
            ``last_kernel_initializer`` will not be used.
        last_use_fc_bn (bool): whether to use Batch Normalization for the
            last fc layer.
        name (str): name of the network.

    Returns:
        the parallelized network
    """
    # Build one ordinary encoder; ``make_parallel`` turns it into ``n``
    # independently initialized copies.
    single_net = EncodingNetwork(
        input_tensor_spec=input_tensor_spec,
        output_tensor_spec=output_tensor_spec,
        input_preprocessors=input_preprocessors,
        preprocessing_combiner=preprocessing_combiner,
        conv_layer_params=conv_layer_params,
        fc_layer_params=fc_layer_params,
        activation=activation,
        kernel_initializer=kernel_initializer,
        use_fc_bn=use_fc_bn,
        last_layer_size=last_layer_size,
        last_activation=last_activation,
        last_kernel_initializer=last_kernel_initializer,
        last_use_fc_bn=last_use_fc_bn,
        name=name)
    parallel_net = single_net.make_parallel(n, True)
    return parallel_net
@alf.configurable
class LSTMEncodingNetwork(_Sequential):
    """LSTM cells followed by an encoding network.

    The network is assembled as a sequence of sub-networks, in this order:

    1. an optional ``EncodingNetwork`` (input preprocessing, conv layers and
       FC layers applied before the LSTM cells);
    2. one ``alf.nn.LSTMCell`` per entry of ``hidden_size``;
    3. an optional ``EncodingNetwork`` (FC layers applied after the LSTM cells
       and/or the additional last layer);
    4. an optional ``Reshape`` layer when ``output_tensor_spec`` is given.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec=None,
                 input_preprocessors=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 pre_fc_layer_params=None,
                 hidden_size=(100, ),
                 lstm_output_layers=-1,
                 post_fc_layer_params=None,
                 activation=torch.relu_,
                 kernel_initializer=None,
                 last_layer_size=None,
                 last_activation=None,
                 last_kernel_initializer=None,
                 name="LSTMEncodingNetwork"):
        """
        Args:
            input_tensor_spec (nested TensorSpec): the (nested) tensor spec of
                the input. If nested, then ``preprocessing_combiner`` must not
                be None.
            output_tensor_spec (None|TensorSpec): spec for the output. If None,
                the output tensor spec will be assumed as
                ``TensorSpec((output_size, ))``, where ``output_size`` is
                inferred from network output. Otherwise, the output tensor
                spec will be ``output_tensor_spec`` and the network output
                will be reshaped according to ``output_tensor_spec``.
                Note that ``output_tensor_spec`` is only used for reshaping
                the network outputs for interpretation purpose and is not used
                for specifying any network layers.
            input_preprocessors (nested Network|nn.Module|None): a nest of
                input preprocessors, each of which will be applied to the
                corresponding input. If not None, then it must have the same
                structure as ``input_tensor_spec``. This arg is helpful if you
                want to have separate preprocessings for different inputs by
                configuring a gin file without changing the code. For example,
                embedding a discrete input before concatenating it to another
                continuous vector.
            preprocessing_combiner (NestCombiner): preprocessing called on
                complex inputs. Note that this combiner must also accept
                ``input_tensor_spec`` as the input to compute the processed
                tensor spec. For example, see ``alf.nest.utils.NestConcat``.
                This arg is helpful if you want to combine inputs by
                configuring a gin file without changing the code.
            conv_layer_params (tuple[tuple]): a tuple of tuples where each
                tuple takes a format
                ``(filters, kernel_size, strides, padding)``, where
                ``padding`` is optional.
            pre_fc_layer_params (tuple[int]): a tuple of integers
                representing FC layers that are applied before the LSTM cells.
            hidden_size (int or tuple[int]): the hidden size(s) of
                the lstm cell(s). Each size corresponds to a cell. If there are
                multiple sizes, then lstm cells are stacked.
            lstm_output_layers (None|int|list[int]): indices of the lstm
                layers whose outputs are used. -1 means the output from
                the last lstm layer. ``None`` means all lstm layers. Negative
                indices count from the last layer.
            post_fc_layer_params (tuple[int]): an optional tuple of
                integers representing hidden FC layers that are applied after
                the LSTM cells.
            activation (nn.functional): activation for all the layers but the
                last layer.
            kernel_initializer (Callable): initializer for all the layers but
                the last layer.
            last_layer_size (int): an optional size of an additional layer
                appended at the very end. Note that if ``last_activation`` is
                specified, ``last_layer_size`` has to be specified explicitly.
            last_activation (nn.functional): activation function of the
                additional layer specified by ``last_layer_size``. Note that if
                ``last_layer_size`` is not None, ``last_activation`` has to be
                specified explicitly.
            last_kernel_initializer (Callable): initializer for the
                additional layer specified by ``last_layer_size``.
                If None, it will be the same as ``kernel_initializer``. If
                ``last_layer_size`` is None, ``last_kernel_initializer`` will
                not be used.
            name (str): name of this network.
        """
        nets = []

        # Stage 1: optional encoder in front of the LSTM cells. It is only
        # created when at least one of its ingredients is provided.
        if (input_preprocessors or preprocessing_combiner or conv_layer_params
                or pre_fc_layer_params):
            net = EncodingNetwork(
                input_tensor_spec=input_tensor_spec,
                input_preprocessors=input_preprocessors,
                preprocessing_combiner=preprocessing_combiner,
                conv_layer_params=conv_layer_params,
                fc_layer_params=pre_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer)
            input_size = net.output_spec.shape[0]
            nets.append(net)
        else:
            input_size = input_tensor_spec.shape[0]

        if isinstance(hidden_size, int):
            hidden_size = [hidden_size]
        else:
            assert isinstance(hidden_size, tuple)

        # Stage 2: the LSTM cells. Each cell consumes the output of the
        # previous one, so ``input_size`` is updated as cells are created.
        cells = []
        for hs in hidden_size:
            cells.append(
                alf.nn.LSTMCell(input_size=input_size, hidden_size=hs))
            input_size = hs

        # Normalize ``lstm_output_layers`` to a list of non-negative indices.
        if lstm_output_layers is None:
            lstm_output_layers = list(range(len(hidden_size)))
        elif isinstance(lstm_output_layers, int):
            lstm_output_layers = [lstm_output_layers]
        lstm_output_layers = [
            len(cells) + i if i < 0 else i for i in lstm_output_layers
        ]

        if lstm_output_layers == [len(cells) - 1]:
            # Only the last layer's output is needed, so the cells can simply
            # be chained in the outer sequence.
            nets.extend(cells)
        else:
            # Outputs from several (or a non-last) lstm layers are needed:
            # wrap the cells in a named Sequential and concatenate the
            # selected outputs.
            lstms = {'lstm%s' % i: cell for i, cell in enumerate(cells)}
            lstms['o'] = (
                tuple('lstm%s' % i
                      for i in lstm_output_layers),  # the inputs for NestConcat
                alf.layers.NestConcat())
            nets.append(alf.nn.Sequential(**lstms, name='lstm_block'))
            input_size = sum(hidden_size[i] for i in lstm_output_layers)

        # Stage 3: optional encoder after the LSTM cells.
        if post_fc_layer_params is not None or last_layer_size is not None:
            net = EncodingNetwork(
                input_tensor_spec=TensorSpec((input_size, )),
                fc_layer_params=post_fc_layer_params,
                activation=activation,
                kernel_initializer=kernel_initializer,
                last_layer_size=last_layer_size,
                last_activation=last_activation,
                last_kernel_initializer=last_kernel_initializer)
            nets.append(net)
            input_size = net.output_spec.numel

        # Stage 4: optional reshape of the flat output.
        if output_tensor_spec is not None:
            assert output_tensor_spec.numel == input_size, (
                "network output "
                "size {a} is inconsistent with specified output_tensor_spec "
                "of size {b}".format(a=input_size, b=output_tensor_spec.numel))
            nets.append(alf.layers.Reshape(output_tensor_spec.shape))

        super().__init__(nets, input_tensor_spec=input_tensor_spec, name=name)

    def make_parallel(self, n: int, allow_non_parallel_input=False):
        """Make a parallelized version of this network.

        A parallel network has ``n`` copies of network with the same structure
        but different independently initialized parameters. The parallel
        network can process a batch of the data with shape
        [batch_size, n, ...] using ``n`` networks with same structure.

        Args:
            n (int): the number of copies
            allow_non_parallel_input (bool): if True, the returned network will
                also accept non-parallel input with shape [batch_size, ...]. In
                this case, the network will check whether the input is parallel
                input. If not, the input will be automatically replicated ``n``
                times at the beginning.
        Returns:
            the parallelized network.
        """
        pnet = super().make_parallel(n)
        if allow_non_parallel_input:
            # Wrap the parallel network so that non-parallel inputs are also
            # accepted.
            return _ReplicateInputForParallel(
                self.input_tensor_spec, n, pnet, name=pnet.name)
        return pnet