pytorch-image-models/timm/models/layers/global_context.py

""" Global Context Attention Block

Paper: `GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond`
    - https://arxiv.org/abs/1904.11492

Official code consulted as reference: https://github.com/xvjiarui/GCNet

Hacked together by / Copyright 2021 Ross Wightman
"""
from torch import nn as nn
import torch.nn.functional as F

from .create_act import create_act_layer, get_act_layer
from .helpers import make_divisible
from .mlp import ConvMlp
from .norm import LayerNorm2d


class GlobalContext(nn.Module):
    """Global Context attention block (GCNet-style).

    The input feature map is pooled into one context tensor of shape
    (B, C, 1, 1), either with a learned softmax attention map over the
    spatial positions or with a plain spatial mean. That context is then
    passed through up to two independent ``ConvMlp`` branches:

    * ``mlp_scale`` -- its output goes through ``gate`` and multiplies ``x``.
    * ``mlp_add`` -- its output is added to ``x``.

    When both branches are enabled, scaling is applied before the addition.

    Args:
        channels: Number of channels of the input feature map.
        use_attn: If true, pool with a learned 1x1-conv attention map;
            otherwise pool with a spatial mean.
        fuse_add: Enable the additive branch.
        fuse_scale: Enable the gated multiplicative branch.
        init_last_zero: Stored on the module as ``self.init_last_zero``.
            NOTE(review): nothing in this class reads it back;
            ``reset_parameters`` zeroes ``mlp_add.fc2.weight`` whenever the
            additive branch exists, regardless of this flag.
        rd_ratio: Ratio applied to ``channels`` to derive the hidden width
            of the MLP branches when ``rd_channels`` is not given.
        rd_channels: Explicit hidden width; takes precedence over
            ``rd_ratio`` when it is not ``None``.
        rd_divisor: Divisor handed to ``make_divisible`` when deriving the
            hidden width.
        act_layer: Activation for the MLP branches (resolved through
            ``get_act_layer``).
        gate_layer: Gate activation for the scale branch (built through
            ``create_act_layer``).
    """

    def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False,
                 rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'):
        super().__init__()
        act_layer = get_act_layer(act_layer)

        # NOTE: submodules are created in a fixed order (attention conv,
        # add branch, scale branch, gate) so parameter registration and
        # random-init consumption stay stable.
        if use_attn:
            # 1x1 conv producing a single attention logit per position
            self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True)
        else:
            self.conv_attn = None

        if rd_channels is None:
            rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)

        def _build_branch():
            # both branches share the same channels -> rd_channels bottleneck config
            return ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)

        self.mlp_add = _build_branch() if fuse_add else None
        self.mlp_scale = _build_branch() if fuse_scale else None

        self.gate = create_act_layer(gate_layer)
        self.init_last_zero = init_last_zero
        self.reset_parameters()

    def reset_parameters(self):
        """(Re-)initialise the attention conv and the additive branch.

        The attention conv weight gets a Kaiming-normal init; the weight of
        the additive branch's ``fc2`` layer is set to zero. Branches that
        are disabled (``None``) are skipped.
        """
        attn_conv = self.conv_attn
        if attn_conv is not None:
            nn.init.kaiming_normal_(attn_conv.weight, mode='fan_in', nonlinearity='relu')

        add_branch = self.mlp_add
        if add_branch is not None:
            nn.init.zeros_(add_branch.fc2.weight)

    def forward(self, x):
        """Apply global-context modulation to a 4-D feature map.

        Args:
            x: Tensor of shape (B, C, H, W).

        Returns:
            Tensor of the same shape as ``x``, scaled and/or shifted by the
            pooled context depending on which branches are enabled.
        """
        B, C, H, W = x.shape

        if self.conv_attn is None:
            # no attention: plain spatial average, (B, C, 1, 1)
            context = x.mean(dim=(2, 3), keepdim=True)
        else:
            logits = self.conv_attn(x).reshape(B, 1, H * W)  # (B, 1, H * W)
            weights = F.softmax(logits, dim=-1).unsqueeze(3)  # (B, 1, H * W, 1)
            flat = x.reshape(B, C, H * W).unsqueeze(1)  # (B, 1, C, H * W)
            # attention-weighted sum over positions: (B, 1, C, 1) -> (B, C, 1, 1)
            context = (flat @ weights).view(B, C, 1, 1)

        if self.mlp_scale is not None:
            x = x * self.gate(self.mlp_scale(context))
        if self.mlp_add is not None:
            x = x + self.mlp_add(context)

        return x
Add Gather-Excite and Global Context attn modules. Refactor existing SE-like attn for consistency and refactor byob/byoanet for less redundancy. 3 years ago			`""" Global Context Attention Block`

			Paper: `GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond`
			`- https://arxiv.org/abs/1904.11492`

			`Official code consulted as reference: https://github.com/xvjiarui/GCNet`

			`Hacked together by / Copyright 2021 Ross Wightman`
			`"""`
			`from torch import nn as nn`
			`import torch.nn.functional as F`

			`from .create_act import create_act_layer, get_act_layer`
			`from .helpers import make_divisible`
			`from .mlp import ConvMlp`
			`from .norm import LayerNorm2d`


			`class GlobalContext(nn.Module):`

Add gc_efficientnetv2_rw_t weights (global context instead of SE attn). Add TF XL weights even though the fine-tuned ones don't validate that well. Change default arg for GlobalContext to use scale (mul) mode. 3 years ago			`def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False,`
Add Gather-Excite and Global Context attn modules. Refactor existing SE-like attn for consistency and refactor byob/byoanet for less redundancy. 3 years ago			`rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'):`
			`super(GlobalContext, self).__init__()`
			`act_layer = get_act_layer(act_layer)`

			`self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True) if use_attn else None`

			`if rd_channels is None:`
			`rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.)`
			`if fuse_add:`
			`self.mlp_add = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)`
			`else:`
			`self.mlp_add = None`
			`if fuse_scale:`
			`self.mlp_scale = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d)`
			`else:`
			`self.mlp_scale = None`

			`self.gate = create_act_layer(gate_layer)`
			`self.init_last_zero = init_last_zero`
			`self.reset_parameters()`

			`def reset_parameters(self):`
			`if self.conv_attn is not None:`
			`nn.init.kaiming_normal_(self.conv_attn.weight, mode='fan_in', nonlinearity='relu')`
			`if self.mlp_add is not None:`
			`nn.init.zeros_(self.mlp_add.fc2.weight)`

			`def forward(self, x):`
			`B, C, H, W = x.shape`

			`if self.conv_attn is not None:`
			`attn = self.conv_attn(x).reshape(B, 1, H * W) # (B, 1, H * W)`
			`attn = F.softmax(attn, dim=-1).unsqueeze(3) # (B, 1, H * W, 1)`
wip - pre-rebase 3 years ago			`context = x.reshape(B, C, H * W).unsqueeze(1) @ attn`
Add Gather-Excite and Global Context attn modules. Refactor existing SE-like attn for consistency and refactor byob/byoanet for less redundancy. 3 years ago			`context = context.view(B, C, 1, 1)`
			`else:`
			`context = x.mean(dim=(2, 3), keepdim=True)`

			`if self.mlp_scale is not None:`
			`mlp_x = self.mlp_scale(context)`
			`x = x * self.gate(mlp_x)`
			`if self.mlp_add is not None:`
			`mlp_x = self.mlp_add(context)`
			`x = x + mlp_x`

			`return x`