""" PyTorch Conditionally Parameterized Convolution (CondConv)
|
|
|
|
|
|
|
|
Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference
|
|
|
|
(https://arxiv.org/abs/1904.04971)
|
|
|
|
|
|
|
|
Hacked together by Ross Wightman
|
|
|
|
"""
|
|
|
|
|
|
|
|
import math
from functools import partial

import numpy as np
import torch
from torch import nn as nn
from torch.nn import functional as F

from .helpers import tup_pair
from .conv2d_same import conv2d_same
from .padding import get_padding_value


def get_condconv_initializer(initializer, num_experts, expert_shape):
    def condconv_initializer(weight):
        """CondConv initializer function."""
        num_params = np.prod(expert_shape)
        if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
                weight.shape[1] != num_params):
            raise ValueError(
                'CondConv variables must have shape [num_experts, num_params]')
        for i in range(num_experts):
            initializer(weight[i].view(expert_shape))
    return condconv_initializer
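
# Illustrative sketch (not part of the original module): how the wrapper might be
# applied to a flattened bank of expert weights. The shapes below are assumptions
# chosen only for the example; the actual use is in CondConv2d.reset_parameters().
#
#   expert_shape = (32, 16, 3, 3)   # (out_ch, in_ch // groups, kH, kW) of one expert
#   num_experts = 8
#   weight = torch.nn.Parameter(torch.Tensor(num_experts, int(np.prod(expert_shape))))
#   init_fn = get_condconv_initializer(
#       partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), num_experts, expert_shape)
#   init_fn(weight)                 # each expert row is initialized in its conv shape
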

class CondConv2d(nn.Module):
    """ Conditionally Parameterized Convolution
    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py

    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
    https://github.com/pytorch/pytorch/issues/17983
    """
    __constants__ = ['in_channels', 'out_channels', 'dynamic_padding']

    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
        super(CondConv2d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = tup_pair(kernel_size)
        self.stride = tup_pair(stride)
        padding_val, is_padding_dynamic = get_padding_value(
            padding, kernel_size, stride=stride, dilation=dilation)
        self.dynamic_padding = is_padding_dynamic  # checked in forward; kept as an attribute so the branch works with torchscript
        self.padding = tup_pair(padding_val)
        self.dilation = tup_pair(dilation)
        self.groups = groups
        self.num_experts = num_experts

        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
        weight_num_param = 1
        for wd in self.weight_shape:
            weight_num_param *= wd
        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))

        if bias:
            self.bias_shape = (self.out_channels,)
            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        init_weight = get_condconv_initializer(
            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
        init_weight(self.weight)
        if self.bias is not None:
            fan_in = np.prod(self.weight_shape[1:])
            bound = 1 / math.sqrt(fan_in)
            init_bias = get_condconv_initializer(
                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
            init_bias(self.bias)

    def forward(self, x, routing_weights):
        B, C, H, W = x.shape
        # mix the expert weight banks with each sample's routing weights to get per-sample kernels
        weight = torch.matmul(routing_weights, self.weight)
        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
        weight = weight.view(new_weight_shape)
        bias = None
        if self.bias is not None:
            bias = torch.matmul(routing_weights, self.bias)
            bias = bias.view(B * self.out_channels)
        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
        x = x.view(1, B * C, H, W)
        if self.dynamic_padding:
            out = conv2d_same(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        else:
            out = F.conv2d(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        # restore the batch dimension: (1, B * out_channels, H', W') -> (B, out_channels, H', W')
        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])

        # Literal port (from TF definition)
        # x = torch.split(x, 1, 0)
        # weight = torch.split(weight, 1, 0)
        # if self.bias is not None:
        #     bias = torch.matmul(routing_weights, self.bias)
        #     bias = torch.split(bias, 1, 0)
        # else:
        #     bias = [None] * B
        # out = []
        # for xi, wi, bi in zip(x, weight, bias):
        #     wi = wi.view(*self.weight_shape)
        #     if bi is not None:
        #         bi = bi.view(*self.bias_shape)
        #     out.append(self.conv_fn(
        #         xi, wi, bi, stride=self.stride, padding=self.padding,
        #         dilation=self.dilation, groups=self.groups))
        # out = torch.cat(out, 0)
        return out
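
# Illustrative usage sketch (not part of the original module): CondConv2d expects
# per-sample routing weights of shape (B, num_experts). Following the CondConv paper,
# these are commonly produced by a sigmoid-gated linear layer over globally pooled
# input features. The wrapper module and names below are assumptions for the example.
#
# class CondConvExample(nn.Module):
#     def __init__(self, in_chs=16, out_chs=32, num_experts=4):
#         super().__init__()
#         self.routing_fn = nn.Linear(in_chs, num_experts)
#         self.conv = CondConv2d(in_chs, out_chs, kernel_size=3, num_experts=num_experts)
#
#     def forward(self, x):
#         pooled = F.adaptive_avg_pool2d(x, 1).flatten(1)            # (B, in_chs)
#         routing_weights = torch.sigmoid(self.routing_fn(pooled))   # (B, num_experts)
#         return self.conv(x, routing_weights)                       # (B, out_chs, H', W')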