Significant norm update

* ConvBnAct layer renamed -> ConvNormAct, with ConvNormActAa for the anti-aliased variant
* Significant update to EfficientNet and MobileNetV3 arch to support NormAct layers and grouped conv (as an alternative to depthwise)
* Update RegNet to add Z variant
* Add Pre variant of XceptionAligned that works with NormAct layers
* EvoNorm matches bits_and_tpu branch for merge
4 years ago · ab49d275de
parent d04f2f1377
commit ab49d275de
31 changed files with 955 additions and 523 deletions
--- a/timm/models/byobnet.py
+++ b/timm/models/byobnet.py
@ -34,8 +34,8 @@ import torch.nn as nn

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg, named_apply
-from .layers import ClassifierHead, ConvBnAct, BatchNormAct2d, DropPath, AvgPool2dSame, \
-    create_conv2d, get_act_layer, convert_norm_act, get_attn, make_divisible, to_2tuple, EvoNorm2dS0, EvoNorm2dS0a,\
+from .layers import ClassifierHead, ConvNormAct, BatchNormAct2d, DropPath, AvgPool2dSame, \
+    create_conv2d, get_act_layer, get_norm_act_layer, get_attn, make_divisible, to_2tuple, EvoNorm2dS0, EvoNorm2dS0a,\
    EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a, FilterResponseNormAct2d, FilterResponseNormTlu2d
 from .registry import register_model

@ -921,7 +921,7 @@ def num_groups(group_size, channels):

@dataclass
 class LayerFn:
-    conv_norm_act: Callable = ConvBnAct
+    conv_norm_act: Callable = ConvNormAct
    norm_act: Callable = BatchNormAct2d
    act: Callable = nn.ReLU
    attn: Optional[Callable] = None
@ -978,7 +978,7 @@ class BasicBlock(nn.Module):
        self.conv1_kxk = layers.conv_norm_act(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0])
        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
        self.conv2_kxk = layers.conv_norm_act(
-            mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_block=drop_block, apply_act=False)
+            mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_layer=drop_block, apply_act=False)
        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
@ -1019,11 +1019,9 @@ class BottleneckBlock(nn.Module):

        self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1)
        self.conv2_kxk = layers.conv_norm_act(
-            mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
-            groups=groups, drop_block=drop_block)
+            mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block)
        if extra_conv:
-            self.conv2b_kxk = layers.conv_norm_act(
-                mid_chs, mid_chs, kernel_size, dilation=dilation[1], groups=groups, drop_block=drop_block)
+            self.conv2b_kxk = layers.conv_norm_act(mid_chs, mid_chs, kernel_size, dilation=dilation[1], groups=groups)
        else:
            self.conv2b_kxk = nn.Identity()
        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
@ -1080,7 +1078,7 @@ class DarkBlock(nn.Module):
        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
        self.conv2_kxk = layers.conv_norm_act(
            mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
-            groups=groups,  drop_block=drop_block, apply_act=False)
+            groups=groups, drop_layer=drop_block, apply_act=False)
        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
        self.act = nn.Identity() if linear_out else layers.act(inplace=True)
@ -1127,8 +1125,7 @@ class EdgeBlock(nn.Module):
            apply_act=False, layers=layers)

        self.conv1_kxk = layers.conv_norm_act(
-            in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
-            groups=groups,  drop_block=drop_block)
+            in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block)
        self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs)
        self.conv2_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False)
        self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs)
@ -1172,7 +1169,7 @@ class RepVggBlock(nn.Module):
        self.identity = layers.norm_act(out_chs, apply_act=False) if use_ident else None
        self.conv_kxk = layers.conv_norm_act(
            in_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
-            groups=groups, drop_block=drop_block, apply_act=False)
+            groups=groups, drop_layer=drop_block, apply_act=False)
        self.conv_1x1 = layers.conv_norm_act(in_chs, out_chs, 1, stride=stride, groups=groups, apply_act=False)
        self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. and use_ident else nn.Identity()
@ -1219,7 +1216,7 @@ class SelfAttnBlock(nn.Module):
        if extra_conv:
            self.conv2_kxk = layers.conv_norm_act(
                mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
-                groups=groups, drop_block=drop_block)
+                groups=groups, drop_layer=drop_block)
            stride = 1  # striding done via conv if enabled
        else:
            self.conv2_kxk = nn.Identity()
@ -1466,8 +1463,8 @@ def create_byob_stages(

 def get_layer_fns(cfg: ByoModelCfg):
    act = get_act_layer(cfg.act_layer)
-    norm_act = convert_norm_act(norm_layer=cfg.norm_layer, act_layer=act)
-    conv_norm_act = partial(ConvBnAct, norm_layer=cfg.norm_layer, act_layer=act)
+    norm_act = get_norm_act_layer(norm_layer=cfg.norm_layer, act_layer=act)
+    conv_norm_act = partial(ConvNormAct, norm_layer=cfg.norm_layer, act_layer=act)
    attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None
    self_attn = partial(get_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None
    layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn)
--- a/timm/models/cspnet.py
+++ b/timm/models/cspnet.py
@ -14,11 +14,10 @@ Hacked together by / Copyright 2020 Ross Wightman
 """
 import torch
 import torch.nn as nn
-import torch.nn.functional as F

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, ConvBnAct, DropPath, create_attn, get_norm_act_layer
+from .layers import ClassifierHead, ConvNormAct, ConvNormActAa, DropPath, create_attn, get_norm_act_layer
 from .registry import register_model


@ -130,7 +129,7 @@ model_cfgs = dict(

 def create_stem(
        in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='',
-        act_layer=None, norm_layer=None, aa_layer=None):
+        act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None):
    stem = nn.Sequential()
    if not isinstance(out_chs, (tuple, list)):
        out_chs = [out_chs]
@ -138,7 +137,7 @@ def create_stem(
    in_c = in_chans
    for i, out_c in enumerate(out_chs):
        conv_name = f'conv{i + 1}'
-        stem.add_module(conv_name, ConvBnAct(
+        stem.add_module(conv_name, ConvNormAct(
            in_c, out_c, kernel_size, stride=stride if i == 0 else 1,
            act_layer=act_layer, norm_layer=norm_layer))
        in_c = out_c
@ -161,12 +160,14 @@ class ResBottleneck(nn.Module):
                 attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
        super(ResBottleneck, self).__init__()
        mid_chs = int(round(out_chs * bottle_ratio))
-        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block)
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer)

-        self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
-        self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs)
+        self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.conv2 = ConvNormActAa(
+            mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups,
+            aa_layer=aa_layer, drop_layer=drop_block, **ckwargs)
        self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None
-        self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
+        self.conv3 = ConvNormAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
        self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None
        self.drop_path = drop_path
        self.act3 = act_layer(inplace=True)
@ -201,9 +202,11 @@ class DarkBlock(nn.Module):
                 drop_block=None, drop_path=None):
        super(DarkBlock, self).__init__()
        mid_chs = int(round(out_chs * bottle_ratio))
-        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block)
-        self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
-        self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs)
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+        self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.conv2 = ConvNormActAa(
+            mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups,
+            aa_layer=aa_layer, drop_layer=drop_block, **ckwargs)
        self.attn = create_attn(attn_layer, channels=out_chs)
        self.drop_path = drop_path

@ -235,7 +238,7 @@ class CrossStage(nn.Module):
        conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'))

        if stride != 1 or first_dilation != dilation:
-            self.conv_down = ConvBnAct(
+            self.conv_down = ConvNormActAa(
                in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
                aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs)
            prev_chs = down_chs
@ -246,7 +249,7 @@ class CrossStage(nn.Module):
        # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
        # there is also special case for the first stage for some of the model that results in uneven split
        # across the two paths. I did it this way for simplicity for now.
-        self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs)
+        self.conv_exp = ConvNormAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs)
        prev_chs = exp_chs // 2  # output of conv_exp is always split in two

        self.blocks = nn.Sequential()
@ -257,8 +260,8 @@ class CrossStage(nn.Module):
            prev_chs = block_out_chs

        # transition convs
-        self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
-        self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
+        self.conv_transition_b = ConvNormAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
+        self.conv_transition = ConvNormAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)

    def forward(self, x):
        if self.conv_down is not None:
@ -280,7 +283,7 @@ class DarkStage(nn.Module):
        super(DarkStage, self).__init__()
        first_dilation = first_dilation or dilation

-        self.conv_down = ConvBnAct(
+        self.conv_down = ConvNormActAa(
            in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
            act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'),
            aa_layer=block_kwargs.get('aa_layer', None))
@ -437,7 +440,7 @@ def cspresnext50(pretrained=False, **kwargs):

@register_model
 def cspresnext50_iabn(pretrained=False, **kwargs):
-    norm_layer = get_norm_act_layer('iabn')
+    norm_layer = get_norm_act_layer('iabn', act_layer='leaky_relu')
    return _create_cspnet('cspresnext50_iabn', pretrained=pretrained, norm_layer=norm_layer, **kwargs)


@ -448,7 +451,7 @@ def cspdarknet53(pretrained=False, **kwargs):

@register_model
 def cspdarknet53_iabn(pretrained=False, **kwargs):
-    norm_layer = get_norm_act_layer('iabn')
+    norm_layer = get_norm_act_layer('iabn', act_layer='leaky_relu')
    return _create_cspnet('cspdarknet53_iabn', pretrained=pretrained, block_fn=DarkBlock, norm_layer=norm_layer, **kwargs)


--- a/timm/models/densenet.py
+++ b/timm/models/densenet.py
@ -14,7 +14,7 @@ from torch.jit.annotations import List

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import BatchNormAct2d, create_norm_act, BlurPool2d, create_classifier
+from .layers import BatchNormAct2d, create_norm_act_layer, BlurPool2d, create_classifier
 from .registry import register_model

 __all__ = ['DenseNet']
@ -370,7 +370,7 @@ def densenet264d_iabn(pretrained=False, **kwargs):
    r"""Densenet-264 model with deep stem and Inplace-ABN
    """
    def norm_act_fn(num_features, **kwargs):
-        return create_norm_act('iabn', num_features, **kwargs)
+        return create_norm_act_layer('iabn', num_features, act_layer='leaky_relu', **kwargs)
    model = _create_densenet(
        'densenet264d_iabn', growth_rate=48, block_config=(6, 12, 64, 48), stem_type='deep',
        norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
--- a/timm/models/dpn.py
+++ b/timm/models/dpn.py
@ -16,7 +16,7 @@ import torch.nn.functional as F

 from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import BatchNormAct2d, ConvBnAct, create_conv2d, create_classifier
+from .layers import BatchNormAct2d, ConvNormAct, create_conv2d, create_classifier
 from .registry import register_model

 __all__ = ['DPN']
@ -180,7 +180,7 @@ class DPN(nn.Module):
        blocks = OrderedDict()

        # conv1
-        blocks['conv1_1'] = ConvBnAct(
+        blocks['conv1_1'] = ConvNormAct(
            in_chans, num_init_features, kernel_size=3 if small else 7, stride=2, norm_layer=norm_layer)
        blocks['conv1_pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.feature_info = [dict(num_chs=num_init_features, reduction=2, module='features.conv1_1')]
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@ -45,7 +45,7 @@ from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficien
    round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
 from .features import FeatureInfo, FeatureHooks
 from .helpers import build_model_with_cfg, default_cfg_for_features
-from .layers import create_conv2d, create_classifier
+from .layers import create_conv2d, create_classifier, get_norm_act_layer, EvoNorm2dS0, GroupNormAct
 from .registry import register_model

 __all__ = ['EfficientNet', 'EfficientNetFeatures']
@ -117,6 +117,20 @@ default_cfgs = {
    'efficientnet_l2': _cfg(
        url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961),

+    # FIXME experimental
+    'efficientnet_b0_gn': _cfg(
+        url=''),
+    'efficientnet_b0_g8': _cfg(
+        url=''),
+    'efficientnet_b0_g16_evos': _cfg(
+        url=''),
+    'efficientnet_b3_gn': _cfg(
+        url='',
+        input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0),
+    'efficientnet_b3_g8_gn': _cfg(
+        url='',
+        input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0),
+
    'efficientnet_es': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'),
    'efficientnet_em': _cfg(
@ -431,6 +445,7 @@ class EfficientNet(nn.Module):
        super(EfficientNet, self).__init__()
        act_layer = act_layer or nn.ReLU
        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
        se_layer = se_layer or SqueezeExcite
        self.num_classes = num_classes
        self.num_features = num_features
@ -440,8 +455,7 @@ class EfficientNet(nn.Module):
        if not fix_stem:
            stem_size = round_chs_fn(stem_size)
        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
-        self.bn1 = norm_layer(stem_size)
-        self.act1 = act_layer(inplace=True)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
@ -453,17 +467,16 @@ class EfficientNet(nn.Module):

        # Head + Pooling
        self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type)
-        self.bn2 = norm_layer(self.num_features)
-        self.act2 = act_layer(inplace=True)
+        self.bn2 = norm_act_layer(self.num_features, inplace=True)
        self.global_pool, self.classifier = create_classifier(
            self.num_features, self.num_classes, pool_type=global_pool)

        efficientnet_init_weights(self)

    def as_sequential(self):
-        layers = [self.conv_stem, self.bn1, self.act1]
+        layers = [self.conv_stem, self.bn1]
        layers.extend(self.blocks)
-        layers.extend([self.conv_head, self.bn2, self.act2, self.global_pool])
+        layers.extend([self.conv_head, self.bn2, self.global_pool])
        layers.extend([nn.Dropout(self.drop_rate), self.classifier])
        return nn.Sequential(*layers)

@ -478,11 +491,9 @@ class EfficientNet(nn.Module):
    def forward_features(self, x):
        x = self.conv_stem(x)
        x = self.bn1(x)
-        x = self.act1(x)
        x = self.blocks(x)
        x = self.conv_head(x)
        x = self.bn2(x)
-        x = self.act2(x)
        return x

    def forward(self, x):
@ -506,6 +517,7 @@ class EfficientNetFeatures(nn.Module):
        super(EfficientNetFeatures, self).__init__()
        act_layer = act_layer or nn.ReLU
        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
        se_layer = se_layer or SqueezeExcite
        self.drop_rate = drop_rate

@ -513,8 +525,7 @@ class EfficientNetFeatures(nn.Module):
        if not fix_stem:
            stem_size = round_chs_fn(stem_size)
        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
-        self.bn1 = norm_layer(stem_size)
-        self.act1 = act_layer(inplace=True)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
@ -536,7 +547,6 @@ class EfficientNetFeatures(nn.Module):
    def forward(self, x) -> List[torch.Tensor]:
        x = self.conv_stem(x)
        x = self.bn1(x)
-        x = self.act1(x)
        if self.feature_hooks is None:
            features = []
            if 0 in self._stage_out_idx:
@ -767,7 +777,9 @@ def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs):
    return model


-def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+def _gen_efficientnet(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, channel_divisor=8,
+        group_size=None, pretrained=False, **kwargs):
    """Creates an EfficientNet model.

    Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py
@ -800,9 +812,9 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
        ['ir_r4_k5_s2_e6_c192_se0.25'],
        ['ir_r1_k3_s1_e6_c320_se0.25'],
    ]
-    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
+    round_chs_fn = partial(round_channels, multiplier=channel_multiplier, divisor=channel_divisor)
    model_kwargs = dict(
-        block_args=decode_arch_def(arch_def, depth_multiplier),
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
        num_features=round_chs_fn(1280),
        stem_size=32,
        round_chs_fn=round_chs_fn,
@ -814,7 +826,8 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
    return model


-def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs):
+def _gen_efficientnet_edge(
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, pretrained=False, **kwargs):
    """ Creates an EfficientNet-EdgeTPU model

    Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu
@ -832,7 +845,7 @@ def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0
    ]
    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
    model_kwargs = dict(
-        block_args=decode_arch_def(arch_def, depth_multiplier),
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
        num_features=round_chs_fn(1280),
        stem_size=32,
        round_chs_fn=round_chs_fn,
@ -946,7 +959,7 @@ def _gen_efficientnetv2_base(


 def _gen_efficientnetv2_s(
-        variant, channel_multiplier=1.0, depth_multiplier=1.0, rw=False, pretrained=False, **kwargs):
+        variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, rw=False, pretrained=False, **kwargs):
    """ Creates an EfficientNet-V2 Small model

    Ref impl: https://github.com/google/automl/tree/master/efficientnetv2
@ -972,7 +985,7 @@ def _gen_efficientnetv2_s(

    round_chs_fn = partial(round_channels, multiplier=channel_multiplier)
    model_kwargs = dict(
-        block_args=decode_arch_def(arch_def, depth_multiplier),
+        block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size),
        num_features=round_chs_fn(num_features),
        stem_size=24,
        round_chs_fn=round_chs_fn,
@ -1366,6 +1379,52 @@ def efficientnet_l2(pretrained=False, **kwargs):
    return model


+# FIXME experimental group conv / GroupNorm / EvoNorm experiments
+@register_model
+def efficientnet_b0_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B0 + GroupNorm"""
+    model = _gen_efficientnet(
+        'efficientnet_b0_gn', norm_layer=partial(GroupNormAct, group_size=8), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b0_g8(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ group conv + BN"""
+    model = _gen_efficientnet(
+        'efficientnet_b0_g8', group_size=8, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b0_g16_evos(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ group 16 conv + EvoNorm"""
+    model = _gen_efficientnet(
+        'efficientnet_b0_g16_evos', group_size=16, channel_divisor=16,
+        norm_layer=partial(EvoNorm2dS0, group_size=16), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b3_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B3 w/ GroupNorm """
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b3_gn', channel_multiplier=1.2, depth_multiplier=1.4, channel_divisor=16,
+        norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def efficientnet_b3_g8_gn(pretrained=False, **kwargs):
+    """ EfficientNet-B3 w/ grouped conv + GroupNorm"""
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
+    model = _gen_efficientnet(
+        'efficientnet_b3_g8_gn', channel_multiplier=1.2, depth_multiplier=1.4, group_size=8, channel_divisor=16,
+        norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs)
+    return model
+
+
@register_model
 def efficientnet_es(pretrained=False, **kwargs):
    """ EfficientNet-Edge Small. """
@ -1373,6 +1432,7 @@ def efficientnet_es(pretrained=False, **kwargs):
        'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
    return model

+
@register_model
 def efficientnet_es_pruned(pretrained=False, **kwargs):
    """ EfficientNet-Edge Small Pruned. For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0"""
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@ -2,18 +2,31 @@

 Hacked together by / Copyright 2020 Ross Wightman
 """
+import math

 import torch
 import torch.nn as nn
 from torch.nn import functional as F

-from .layers import create_conv2d, drop_path, make_divisible, create_act_layer
-from .layers.activations import sigmoid
+from .layers import create_conv2d, DropPath, make_divisible, create_act_layer, get_norm_act_layer

 __all__ = [
    'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual']


+def num_groups(group_size, channels):
+    if not group_size:  # 0 or None
+        return 1  # normal conv with 1 group
+    else:
+        # NOTE group_size == 1 -> depthwise conv
+        #assert channels % group_size == 0
+        if channels % group_size != 0:
+            num_groups = math.floor(channels / group_size)
+            print(channels, group_size, num_groups)
+            return int(num_groups)
+        return channels // group_size
+
+
 class SqueezeExcite(nn.Module):
    """ Squeeze-and-Excitation w/ specific features for EfficientNet/MobileNet family

@ -51,31 +64,30 @@ class ConvBnAct(nn.Module):
    """ Conv + Norm Layer + Activation w/ optional skip connection
    """
    def __init__(
-            self, in_chs, out_chs, kernel_size, stride=1, dilation=1, pad_type='',
+            self, in_chs, out_chs, kernel_size, stride=1, dilation=1, group_size=0, pad_type='',
            skip=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_path_rate=0.):
        super(ConvBnAct, self).__init__()
-        self.has_residual = skip and stride == 1 and in_chs == out_chs
-        self.drop_path_rate = drop_path_rate
-        self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
-        self.bn1 = norm_layer(out_chs)
-        self.act1 = act_layer(inplace=True)
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        groups = num_groups(group_size, in_chs)
+        self.has_skip = skip and stride == 1 and in_chs == out_chs
+
+        self.conv = create_conv2d(
+            in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, groups=groups, padding=pad_type)
+        self.bn1 = norm_act_layer(out_chs, inplace=True)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()

    def feature_info(self, location):
        if location == 'expansion':  # output of conv after act, same as block coutput
-            info = dict(module='act1', hook_type='forward', num_chs=self.conv.out_channels)
+            return dict(module='bn1', hook_type='forward', num_chs=self.conv.out_channels)
        else:  # location == 'bottleneck', block output
-            info = dict(module='', hook_type='', num_chs=self.conv.out_channels)
-        return info
+            return dict(module='', hook_type='', num_chs=self.conv.out_channels)

    def forward(self, x):
        shortcut = x
        x = self.conv(x)
        x = self.bn1(x)
-        x = self.act1(x)
-        if self.has_residual:
-            if self.drop_path_rate > 0.:
-                x = drop_path(x, self.drop_path_rate, self.training)
-            x += shortcut
+        if self.has_skip:
+            x = x + self.drop_path(shortcut)
        return x


@ -85,50 +97,41 @@ class DepthwiseSeparableConv(nn.Module):
    (factor of 1.0). This is an alternative to having a IR with an optional first pw conv.
    """
    def __init__(
-            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='',
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
            noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
            se_layer=None, drop_path_rate=0.):
        super(DepthwiseSeparableConv, self).__init__()
-        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        groups = num_groups(group_size, in_chs)
+        self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip
        self.has_pw_act = pw_act  # activation after point-wise conv
-        self.drop_path_rate = drop_path_rate

        self.conv_dw = create_conv2d(
-            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
-        self.bn1 = norm_layer(in_chs)
-        self.act1 = act_layer(inplace=True)
+            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups)
+        self.bn1 = norm_act_layer(in_chs, inplace=True)

        # Squeeze-and-excitation
        self.se = se_layer(in_chs, act_layer=act_layer) if se_layer else nn.Identity()

        self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
-        self.bn2 = norm_layer(out_chs)
-        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()
+        self.bn2 = norm_act_layer(out_chs, inplace=True, apply_act=self.has_pw_act)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()

    def feature_info(self, location):
        if location == 'expansion':  # after SE, input to PW
-            info = dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels)
+            return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels)
        else:  # location == 'bottleneck', block output
-            info = dict(module='', hook_type='', num_chs=self.conv_pw.out_channels)
-        return info
+            return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels)

    def forward(self, x):
        shortcut = x
-
        x = self.conv_dw(x)
        x = self.bn1(x)
-        x = self.act1(x)
-
        x = self.se(x)
-
        x = self.conv_pw(x)
        x = self.bn2(x)
-        x = self.act2(x)
-
-        if self.has_residual:
-            if self.drop_path_rate > 0.:
-                x = drop_path(x, self.drop_path_rate, self.training)
-            x += shortcut
+        if self.has_skip:
+            x = x + self.drop_path(shortcut)
        return x


@ -143,66 +146,51 @@ class InvertedResidual(nn.Module):
    """

    def __init__(
-            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='',
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
            noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU,
            norm_layer=nn.BatchNorm2d, se_layer=None, conv_kwargs=None, drop_path_rate=0.):
        super(InvertedResidual, self).__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
        conv_kwargs = conv_kwargs or {}
        mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_path_rate = drop_path_rate
+        groups = num_groups(group_size, mid_chs)
+        self.has_skip = (in_chs == out_chs and stride == 1) and not noskip

        # Point-wise expansion
        self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
-        self.bn1 = norm_layer(mid_chs)
-        self.act1 = act_layer(inplace=True)
+        self.bn1 = norm_act_layer(mid_chs, inplace=True)

        # Depth-wise convolution
        self.conv_dw = create_conv2d(
            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
-            padding=pad_type, depthwise=True, **conv_kwargs)
-        self.bn2 = norm_layer(mid_chs)
-        self.act2 = act_layer(inplace=True)
+            groups=groups, padding=pad_type, **conv_kwargs)
+        self.bn2 = norm_act_layer(mid_chs, inplace=True)

        # Squeeze-and-excitation
        self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity()

        # Point-wise linear projection
        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
-        self.bn3 = norm_layer(out_chs)
+        self.bn3 = norm_act_layer(out_chs, apply_act=False)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()

    def feature_info(self, location):
        if location == 'expansion':  # after SE, input to PWL
-            info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
+            return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
        else:  # location == 'bottleneck', block output
-            info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)
-        return info
+            return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)

    def forward(self, x):
        shortcut = x
-
-        # Point-wise expansion
        x = self.conv_pw(x)
        x = self.bn1(x)
-        x = self.act1(x)
-
-        # Depth-wise convolution
        x = self.conv_dw(x)
        x = self.bn2(x)
-        x = self.act2(x)
-
-        # Squeeze-and-excitation
        x = self.se(x)
-
-        # Point-wise linear projection
        x = self.conv_pwl(x)
        x = self.bn3(x)
-
-        if self.has_residual:
-            if self.drop_path_rate > 0.:
-                x = drop_path(x, self.drop_path_rate, self.training)
-            x += shortcut
-
+        if self.has_skip:
+            x = x + self.drop_path(shortcut)
        return x


@ -210,7 +198,7 @@ class CondConvResidual(InvertedResidual):
    """ Inverted residual block w/ CondConv routing"""

    def __init__(
-            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, pad_type='',
+            self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='',
            noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU,
            norm_layer=nn.BatchNorm2d, se_layer=None, num_experts=0, drop_path_rate=0.):

@ -218,8 +206,8 @@ class CondConvResidual(InvertedResidual):
        conv_kwargs = dict(num_experts=self.num_experts)

        super(CondConvResidual, self).__init__(
-            in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type,
-            act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
+            in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, group_size=group_size,
+            pad_type=pad_type, act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
            pw_kernel_size=pw_kernel_size, se_layer=se_layer, norm_layer=norm_layer, conv_kwargs=conv_kwargs,
            drop_path_rate=drop_path_rate)

@ -227,32 +215,17 @@ class CondConvResidual(InvertedResidual):

    def forward(self, x):
        shortcut = x
-
-        # CondConv routing
-        pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)
+        pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1)  # CondConv routing
        routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs))
-
-        # Point-wise expansion
        x = self.conv_pw(x, routing_weights)
        x = self.bn1(x)
-        x = self.act1(x)
-
-        # Depth-wise convolution
        x = self.conv_dw(x, routing_weights)
        x = self.bn2(x)
-        x = self.act2(x)
-
-        # Squeeze-and-excitation
        x = self.se(x)
-
-        # Point-wise linear projection
        x = self.conv_pwl(x, routing_weights)
        x = self.bn3(x)
-
-        if self.has_residual:
-            if self.drop_path_rate > 0.:
-                x = drop_path(x, self.drop_path_rate, self.training)
-            x += shortcut
+        if self.has_skip:
+            x = x + self.drop_path(shortcut)
        return x


@ -269,55 +242,44 @@ class EdgeResidual(nn.Module):
    """

    def __init__(
-            self, in_chs, out_chs, exp_kernel_size=3, stride=1, dilation=1, pad_type='',
+            self, in_chs, out_chs, exp_kernel_size=3, stride=1, dilation=1, group_size=0, pad_type='',
            force_in_chs=0, noskip=False, exp_ratio=1.0, pw_kernel_size=1, act_layer=nn.ReLU,
            norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.):
        super(EdgeResidual, self).__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
        if force_in_chs > 0:
            mid_chs = make_divisible(force_in_chs * exp_ratio)
        else:
            mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_path_rate = drop_path_rate
+        groups = num_groups(group_size, in_chs)
+        self.has_skip = (in_chs == out_chs and stride == 1) and not noskip

        # Expansion convolution
        self.conv_exp = create_conv2d(
-            in_chs, mid_chs, exp_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
-        self.bn1 = norm_layer(mid_chs)
-        self.act1 = act_layer(inplace=True)
+            in_chs, mid_chs, exp_kernel_size, stride=stride, dilation=dilation, groups=groups, padding=pad_type)
+        self.bn1 = norm_act_layer(mid_chs, inplace=True)

        # Squeeze-and-excitation
        self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity()

        # Point-wise linear projection
        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type)
-        self.bn2 = norm_layer(out_chs)
+        self.bn2 = norm_act_layer(out_chs, apply_act=False)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity()

    def feature_info(self, location):
        if location == 'expansion':  # after SE, before PWL
-            info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
+            return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels)
        else:  # location == 'bottleneck', block output
-            info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)
-        return info
+            return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels)

    def forward(self, x):
        shortcut = x
-
-        # Expansion convolution
        x = self.conv_exp(x)
        x = self.bn1(x)
-        x = self.act1(x)
-
-        # Squeeze-and-excitation
        x = self.se(x)
-
-        # Point-wise linear projection
        x = self.conv_pwl(x)
        x = self.bn2(x)
-
-        if self.has_residual:
-            if self.drop_path_rate > 0.:
-                x = drop_path(x, self.drop_path_rate, self.training)
-            x += shortcut
-
+        if self.has_skip:
+            x = x + self.drop_path(shortcut)
        return x
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@ -139,60 +139,52 @@ def _decode_block_str(block_str):
    exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1
    pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1
    force_in_chs = int(options['fc']) if 'fc' in options else 0  # FIXME hack to deal with in_chs issue in TPU def
-
    num_repeat = int(options['r'])
+
    # each type of block has different valid arguments, fill accordingly
+    block_args = dict(
+        block_type=block_type,
+        out_chs=int(options['c']),
+        stride=int(options['s']),
+        act_layer=act_layer,
+    )
    if block_type == 'ir':
-        block_args = dict(
-            block_type=block_type,
+        block_args.update(dict(
            dw_kernel_size=_parse_ksize(options['k']),
            exp_kernel_size=exp_kernel_size,
            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
            exp_ratio=float(options['e']),
            se_ratio=float(options['se']) if 'se' in options else 0.,
-            stride=int(options['s']),
-            act_layer=act_layer,
            noskip=skip is False,
-        )
+        ))
        if 'cc' in options:
            block_args['num_experts'] = int(options['cc'])
    elif block_type == 'ds' or block_type == 'dsa':
-        block_args = dict(
-            block_type=block_type,
+        block_args.update(dict(
            dw_kernel_size=_parse_ksize(options['k']),
            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
            se_ratio=float(options['se']) if 'se' in options else 0.,
-            stride=int(options['s']),
-            act_layer=act_layer,
            pw_act=block_type == 'dsa',
            noskip=block_type == 'dsa' or skip is False,
-        )
+        ))
    elif block_type == 'er':
-        block_args = dict(
-            block_type=block_type,
+        block_args.update(dict(
            exp_kernel_size=_parse_ksize(options['k']),
            pw_kernel_size=pw_kernel_size,
-            out_chs=int(options['c']),
            exp_ratio=float(options['e']),
            force_in_chs=force_in_chs,
            se_ratio=float(options['se']) if 'se' in options else 0.,
-            stride=int(options['s']),
-            act_layer=act_layer,
            noskip=skip is False,
-        )
+        ))
    elif block_type == 'cn':
-        block_args = dict(
-            block_type=block_type,
+        block_args.update(dict(
            kernel_size=int(options['k']),
-            out_chs=int(options['c']),
-            stride=int(options['s']),
-            act_layer=act_layer,
            skip=skip is True,
-        )
+        ))
    else:
        assert False, 'Unknown block type (%s)' % block_type
+    if 'gs' in options:
+        block_args['group_size'] = options['gs']

    return block_args, num_repeat

@ -235,7 +227,27 @@ def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='c
    return sa_scaled


-def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False):
+def decode_arch_def(
+        arch_def,
+        depth_multiplier=1.0,
+        depth_trunc='ceil',
+        experts_multiplier=1,
+        fix_first_last=False,
+        group_size=None,
+):
+    """ Decode block architecture definition strings -> block kwargs
+
+    Args:
+        arch_def: architecture definition strings, list of list of strings
+        depth_multiplier: network depth multiplier
+        depth_trunc: network depth truncation mode when applying multiplier
+        experts_multiplier: CondConv experts multiplier
+        fix_first_last: fix first and last block depths when multiplier is applied
+        group_size: group size override for all blocks that weren't explicitly set in arch string
+
+    Returns:
+        list of list of block kwargs
+    """
    arch_args = []
    if isinstance(depth_multiplier, tuple):
        assert len(depth_multiplier) == len(arch_def)
@ -250,6 +262,8 @@ def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_
            ba, rep = _decode_block_str(block_str)
            if ba.get('num_experts', 0) > 0 and experts_multiplier > 1:
                ba['num_experts'] *= experts_multiplier
+            if group_size is not None:
+                ba.setdefault('group_size', group_size)
            stack_args.append(ba)
            repeats.append(rep)
        if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1):
--- a/timm/models/layers/init.py
+++ b/timm/models/layers/init.py
@ -7,11 +7,11 @@ from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\
    set_layer_config
 from .conv2d_same import Conv2dSame, conv2d_same
-from .conv_bn_act import ConvBnAct
+from .conv_bn_act import ConvNormAct, ConvNormActAa, ConvBnAct
 from .create_act import create_act_layer, get_act_layer, get_act_fn
 from .create_attn import get_attn, create_attn
 from .create_conv2d import create_conv2d
-from .create_norm_act import get_norm_act_layer, create_norm_act, convert_norm_act
+from .create_norm_act import get_norm_act_layer, create_norm_act_layer, get_norm_act_layer
 from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
 from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn
 from .evo_norm import EvoNorm2dB0, EvoNorm2dB1, EvoNorm2dB2,\
@ -32,7 +32,7 @@ from .patch_embed import PatchEmbed
 from .pool2d_same import AvgPool2dSame, create_pool2d
 from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite
 from .selective_kernel import SelectiveKernel
-from .separable_conv import SeparableConv2d, SeparableConvBnAct
+from .separable_conv import SeparableConv2d, SeparableConvNormAct
 from .space_to_depth import SpaceToDepthModule
 from .split_attn import SplitAttn
 from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
--- a/timm/models/layers/cbam.py
+++ b/timm/models/layers/cbam.py
@ -11,7 +11,7 @@ import torch
 from torch import nn as nn
 import torch.nn.functional as F

-from .conv_bn_act import ConvBnAct
+from .conv_bn_act import ConvNormAct
 from .create_act import create_act_layer, get_act_layer
 from .helpers import make_divisible

@ -56,7 +56,7 @@ class SpatialAttn(nn.Module):
    """
    def __init__(self, kernel_size=7, gate_layer='sigmoid'):
        super(SpatialAttn, self).__init__()
-        self.conv = ConvBnAct(2, 1, kernel_size, act_layer=None)
+        self.conv = ConvNormAct(2, 1, kernel_size, apply_act=False)
        self.gate = create_act_layer(gate_layer)

    def forward(self, x):
@ -70,7 +70,7 @@ class LightSpatialAttn(nn.Module):
    """
    def __init__(self, kernel_size=7, gate_layer='sigmoid'):
        super(LightSpatialAttn, self).__init__()
-        self.conv = ConvBnAct(1, 1, kernel_size, act_layer=None)
+        self.conv = ConvNormAct(1, 1, kernel_size, apply_act=False)
        self.gate = create_act_layer(gate_layer)

    def forward(self, x):
--- a/timm/models/layers/conv_bn_act.py
+++ b/timm/models/layers/conv_bn_act.py
@ -5,14 +5,46 @@ Hacked together by / Copyright 2020 Ross Wightman
 from torch import nn as nn

 from .create_conv2d import create_conv2d
-from .create_norm_act import convert_norm_act
+from .create_norm_act import get_norm_act_layer


-class ConvBnAct(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
-                 bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, aa_layer=None,
-                 drop_block=None):
-        super(ConvBnAct, self).__init__()
+class ConvNormAct(nn.Module):
+    def __init__(
+            self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
+            bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, drop_layer=None):
+        super(ConvNormAct, self).__init__()
+        self.conv = create_conv2d(
+            in_channels, out_channels, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, groups=groups, bias=bias)
+
+        # NOTE for backwards compatibility with models that use separate norm and act layer definitions
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        # NOTE for backwards (weight) compatibility, norm layer name remains `.bn`
+        norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {}
+        self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs)
+
+    @property
+    def in_channels(self):
+        return self.conv.in_channels
+
+    @property
+    def out_channels(self):
+        return self.conv.out_channels
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+ConvBnAct = ConvNormAct
+
+
+class ConvNormActAa(nn.Module):
+    def __init__(
+            self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
+            bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, aa_layer=None, drop_layer=None):
+        super(ConvNormActAa, self).__init__()
        use_aa = aa_layer is not None

        self.conv = create_conv2d(
@ -20,9 +52,11 @@ class ConvBnAct(nn.Module):
            padding=padding, dilation=dilation, groups=groups, bias=bias)

        # NOTE for backwards compatibility with models that use separate norm and act layer definitions
-        norm_act_layer = convert_norm_act(norm_layer, act_layer)
-        self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block)
-        self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        # NOTE for backwards (weight) compatibility, norm layer name remains `.bn`
+        norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {}
+        self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs)
+        self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else nn.Identity()

    @property
    def in_channels(self):
@ -35,6 +69,5 @@ class ConvBnAct(nn.Module):
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
-        if self.aa is not None:
-            x = self.aa(x)
+        x = self.aa(x)
        return x
--- a/timm/models/layers/create_conv2d.py
+++ b/timm/models/layers/create_conv2d.py
@ -16,7 +16,12 @@ def create_conv2d(in_channels, out_channels, kernel_size, **kwargs):
    """
    if isinstance(kernel_size, list):
        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
-        assert 'groups' not in kwargs  # MixedConv groups are defined by kernel list
+        if 'groups' in kwargs:
+            groups = kwargs.pop('groups')
+            if groups == in_channels:
+                kwargs['depthwise'] = True
+            else:
+                assert groups == 1
        # We're going to use only lists for defining the MixedConv2d kernel groups,
        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
        m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs)
--- a/timm/models/layers/create_norm_act.py
+++ b/timm/models/layers/create_norm_act.py
@ -11,12 +11,15 @@ import functools

 from .evo_norm import *
 from .filter_response_norm import FilterResponseNormAct2d, FilterResponseNormTlu2d
-from .norm_act import BatchNormAct2d, GroupNormAct
+from .norm_act import BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d
 from .inplace_abn import InplaceAbn

 _NORM_ACT_MAP = dict(
    batchnorm=BatchNormAct2d,
+    batchnorm2d=BatchNormAct2d,
    groupnorm=GroupNormAct,
+    layernorm=LayerNormAct,
+    layernorm2d=LayerNormAct2d,
    evonormb0=EvoNorm2dB0,
    evonormb1=EvoNorm2dB1,
    evonormb2=EvoNorm2dB2,
@ -33,28 +36,19 @@ _NORM_ACT_MAP = dict(
 )
 _NORM_ACT_TYPES = {m for n, m in _NORM_ACT_MAP.items()}
 # has act_layer arg to define act type
-_NORM_ACT_REQUIRES_ARG = {BatchNormAct2d, GroupNormAct, FilterResponseNormAct2d, InplaceAbn}
+_NORM_ACT_REQUIRES_ARG = {
+    BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d, FilterResponseNormAct2d, InplaceAbn}


-def get_norm_act_layer(layer_name):
-    layer_name = layer_name.replace('_', '').lower().split('-')[0]
-    layer = _NORM_ACT_MAP.get(layer_name, None)
-    assert layer is not None, "Invalid norm_act layer (%s)" % layer_name
-    return layer
-
-
-def create_norm_act(layer_name, num_features, apply_act=True, jit=False, **kwargs):
-    layer_parts = layer_name.split('-')  # e.g. batchnorm-leaky_relu
-    assert len(layer_parts) in (1, 2)
-    layer = get_norm_act_layer(layer_parts[0])
-    #activation_class = layer_parts[1].lower() if len(layer_parts) > 1 else ''   # FIXME support string act selection?
+def create_norm_act_layer(layer_name, num_features, act_layer=None, apply_act=True, jit=False, **kwargs):
+    layer = get_norm_act_layer(layer_name, act_layer=act_layer)
    layer_instance = layer(num_features, apply_act=apply_act, **kwargs)
    if jit:
        layer_instance = torch.jit.script(layer_instance)
    return layer_instance


-def convert_norm_act(norm_layer, act_layer):
+def get_norm_act_layer(norm_layer, act_layer=None):
    assert isinstance(norm_layer, (type, str,  types.FunctionType, functools.partial))
    assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial))
    norm_act_kwargs = {}
@ -65,7 +59,8 @@ def convert_norm_act(norm_layer, act_layer):
        norm_layer = norm_layer.func

    if isinstance(norm_layer, str):
-        norm_act_layer = get_norm_act_layer(norm_layer)
+        layer_name = norm_layer.replace('_', '').lower().split('-')[0]
+        norm_act_layer = _NORM_ACT_MAP.get(layer_name, None)
    elif norm_layer in _NORM_ACT_TYPES:
        norm_act_layer = norm_layer
    elif isinstance(norm_layer,  types.FunctionType):
@ -77,6 +72,10 @@ def convert_norm_act(norm_layer, act_layer):
            norm_act_layer = BatchNormAct2d
        elif type_name.startswith('groupnorm'):
            norm_act_layer = GroupNormAct
+        elif type_name.startswith('layernorm2d'):
+            norm_act_layer = LayerNormAct2d
+        elif type_name.startswith('layernorm'):
+            norm_act_layer = LayerNormAct
        else:
            assert False, f"No equivalent norm_act layer for {type_name}"

--- a/timm/models/layers/drop.py
+++ b/timm/models/layers/drop.py
@ -20,7 +20,7 @@ import torch.nn.functional as F


 def drop_block_2d(
-        x, drop_prob: float = 0.1, block_size: int = 7,  gamma_scale: float = 1.0,
+        x, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0,
        with_noise: bool = False, inplace: bool = False, batchwise: bool = False):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

@ -32,7 +32,7 @@ def drop_block_2d(
    clipped_block_size = min(block_size, min(W, H))
    # seed_drop_rate, the gamma parameter
    gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / (
-        (W - block_size + 1) * (H - block_size + 1))
+            (W - block_size + 1) * (H - block_size + 1))

    # Forces the block to be inside the feature map.
    w_i, h_i = torch.meshgrid(torch.arange(W).to(x.device), torch.arange(H).to(x.device))
@ -104,14 +104,16 @@ def drop_block_fast_2d(
 class DropBlock2d(nn.Module):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
    """
-    def __init__(self,
-                 drop_prob=0.1,
-                 block_size=7,
-                 gamma_scale=1.0,
-                 with_noise=False,
-                 inplace=False,
-                 batchwise=False,
-                 fast=True):
+
+    def __init__(
+            self,
+            drop_prob=0.1,
+            block_size=7,
+            gamma_scale=1.0,
+            with_noise=False,
+            inplace=False,
+            batchwise=False,
+            fast=True):
        super(DropBlock2d, self).__init__()
        self.drop_prob = drop_prob
        self.gamma_scale = gamma_scale
@ -155,6 +157,7 @@ def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: b
 class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
    """
+
    def __init__(self, drop_prob=None, scale_by_keep=True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
--- a/timm/models/layers/evo_norm.py
+++ b/timm/models/layers/evo_norm.py
@ -23,6 +23,7 @@ GPU, similar train speeds for EvoNormS variants and BatchNorm.

 Hacked together by / Copyright 2020 Ross Wightman
 """
+from typing import Sequence, Union

 import torch
 import torch.nn as nn
@ -33,41 +34,57 @@ from .trace_utils import _assert


 def instance_std(x, eps: float = 1e-5):
-    rms = x.float().var(dim=(2, 3), unbiased=False, keepdim=True).add(eps).sqrt().to(x.dtype)
-    return rms.expand(x.shape)
+    std = x.float().var(dim=(2, 3), unbiased=False, keepdim=True).add(eps).sqrt().to(x.dtype)
+    return std.expand(x.shape)
+
+
+def instance_std_tpu(x, eps: float = 1e-5):
+    std = manual_var(x, dim=(2, 3)).add(eps).sqrt()
+    return std.expand(x.shape)
+# instance_std = instance_std_tpu


 def instance_rms(x, eps: float = 1e-5):
-    rms = x.square().float().mean(dim=(2, 3), keepdim=True).add(eps).sqrt().to(dtype=x.dtype)
+    rms = x.float().square().mean(dim=(2, 3), keepdim=True).add(eps).sqrt().to(x.dtype)
    return rms.expand(x.shape)


+def manual_var(x, dim: Union[int, Sequence[int]], diff_sqm: bool = False):
+    xm = x.mean(dim=dim, keepdim=True)
+    if diff_sqm:
+        # difference of mean of squares and squared mean; faster on TPU but can be less stable
+        var = ((x * x).mean(dim=dim, keepdim=True) - (xm * xm)).clamp(0)
+    else:
+        var = ((x - xm) * (x - xm)).mean(dim=dim, keepdim=True)
+    return var
+
+
 def group_std(x, groups: int = 32, eps: float = 1e-5, flatten: bool = False):
    B, C, H, W = x.shape
    x_dtype = x.dtype
    _assert(C % groups == 0, '')
-    # x = x.reshape(B, groups, -1)  # FIXME simpler shape causing TPU / XLA issues
-    # std = x.float().var(dim=2, unbiased=False, keepdim=True).add(eps).sqrt()
-    x = x.reshape(B, groups, C // groups, H, W)
-    std = x.float().var(dim=(2, 3, 4), unbiased=False, keepdim=True).add(eps).sqrt()
-    return std.expand(x.shape).reshape(B, C, H, W).to(x_dtype)
+    if flatten:
+        x = x.reshape(B, groups, -1)  # FIXME simpler shape causing TPU / XLA issues
+        std = x.float().var(dim=2, unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype)
+    else:
+        x = x.reshape(B, groups, C // groups, H, W)
+        std = x.float().var(dim=(2, 3, 4), unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype)
+    return std.expand(x.shape).reshape(B, C, H, W)


-def group_std_tpu(x, groups: int = 32, eps: float = 1e-5, diff_sqm: bool = False):
+def group_std_tpu(x, groups: int = 32, eps: float = 1e-5, diff_sqm: bool = False, flatten: bool = False):
    # This is a workaround for some stability / odd behaviour of .var and .std
    # running on PyTorch XLA w/ TPUs. These manual var impl are producing much better results
    B, C, H, W = x.shape
    _assert(C % groups == 0, '')
-    x_dtype = x.dtype
-    x = x.float().reshape(B, groups, C // groups, H, W)
-    xm = x.mean(dim=(2, 3, 4), keepdim=True)
-    if diff_sqm:
-        # difference of squared mean and mean squared, faster on TPU
-        var = (x.square().mean(dim=(2, 3, 4), keepdim=True) - xm.square()).clamp(0)
+    if flatten:
+        x = x.reshape(B, groups, -1)  # FIXME simpler shape causing TPU / XLA issues
+        var = manual_var(x, dim=-1, diff_sqm=diff_sqm)
    else:
-        var = (x - xm).square().mean(dim=(2, 3, 4), keepdim=True)
-    return var.add(eps).sqrt().expand(x.shape).reshape(B, C, H, W).to(x_dtype)
-# group_std = group_std_tpu  # temporary, for TPU / PT XLA
+        x = x.reshape(B, groups, C // groups, H, W)
+        var = manual_var(x, dim=(2, 3, 4), diff_sqm=diff_sqm)
+    return var.add(eps).sqrt().expand(x.shape).reshape(B, C, H, W)
+#group_std = group_std_tpu  # FIXME TPU temporary


 def group_rms(x, groups: int = 32, eps: float = 1e-5):
@ -75,8 +92,8 @@ def group_rms(x, groups: int = 32, eps: float = 1e-5):
    _assert(C % groups == 0, '')
    x_dtype = x.dtype
    x = x.reshape(B, groups, C // groups, H, W)
-    sqm = x.square().mean(dim=(2, 3, 4), keepdim=True).add(eps).sqrt_().to(dtype=x_dtype)
-    return sqm.expand(x.shape).reshape(B, C, H, W)
+    rms = x.float().square().mean(dim=(2, 3, 4), keepdim=True).add(eps).sqrt_().to(dtype=x_dtype)
+    return rms.expand(x.shape).reshape(B, C, H, W)


 class EvoNorm2dB0(nn.Module):
@ -104,6 +121,7 @@ class EvoNorm2dB0(nn.Module):
        if self.v is not None:
            if self.training:
                var = x.float().var(dim=(0, 2, 3), unbiased=False)
+                # var = manual_var(x, dim=(0, 2, 3)).squeeze()
                n = x.numel() / x.shape[1]
                self.running_var.copy_(
                    self.running_var * (1 - self.momentum) +
@ -230,7 +248,7 @@ class EvoNorm2dS0a(EvoNorm2dS0):
        d = group_std(x, self.groups, self.eps)
        if self.v is not None:
            v = self.v.view(v_shape).to(dtype=x_dtype)
-            x = x * (x * v).sigmoid_()
+            x = x * (x * v).sigmoid()
        x = x / d
        return x * self.weight.view(v_shape).to(dtype=x_dtype) + self.bias.view(v_shape).to(dtype=x_dtype)

--- a/timm/models/layers/inplace_abn.py
+++ b/timm/models/layers/inplace_abn.py
@ -38,7 +38,7 @@ class InplaceAbn(nn.Module):
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True,
-                 act_layer="leaky_relu", act_param=0.01, drop_block=None):
+                 act_layer="leaky_relu", act_param=0.01, drop_layer=None):
        super(InplaceAbn, self).__init__()
        self.num_features = num_features
        self.affine = affine
@ -54,7 +54,7 @@ class InplaceAbn(nn.Module):
                    self.act_name = 'elu'
                elif act_layer == nn.LeakyReLU:
                    self.act_name = 'leaky_relu'
-                elif act_layer == nn.Identity:
+                elif act_layer is None or act_layer == nn.Identity:
                    self.act_name = 'identity'
                else:
                    assert False, f'Invalid act layer {act_layer.__name__} for IABN'
--- a/timm/models/layers/non_local_attn.py
+++ b/timm/models/layers/non_local_attn.py
@ -8,7 +8,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from .conv_bn_act import ConvBnAct
+from .conv_bn_act import ConvNormAct
 from .helpers import make_divisible
 from .trace_utils import _assert

@ -74,10 +74,10 @@ class BilinearAttnTransform(nn.Module):
    def __init__(self, in_channels, block_size, groups, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        super(BilinearAttnTransform, self).__init__()

-        self.conv1 = ConvBnAct(in_channels, groups, 1, act_layer=act_layer, norm_layer=norm_layer)
+        self.conv1 = ConvNormAct(in_channels, groups, 1, act_layer=act_layer, norm_layer=norm_layer)
        self.conv_p = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(block_size, 1))
        self.conv_q = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(1, block_size))
-        self.conv2 = ConvBnAct(in_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer)
+        self.conv2 = ConvNormAct(in_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer)
        self.block_size = block_size
        self.groups = groups
        self.in_channels = in_channels
@ -132,9 +132,9 @@ class BatNonLocalAttn(nn.Module):
        super().__init__()
        if rd_channels is None:
            rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor)
-        self.conv1 = ConvBnAct(in_channels, rd_channels, 1, act_layer=act_layer, norm_layer=norm_layer)
+        self.conv1 = ConvNormAct(in_channels, rd_channels, 1, act_layer=act_layer, norm_layer=norm_layer)
        self.ba = BilinearAttnTransform(rd_channels, block_size, groups, act_layer=act_layer, norm_layer=norm_layer)
-        self.conv2 = ConvBnAct(rd_channels, in_channels, 1,  act_layer=act_layer, norm_layer=norm_layer)
+        self.conv2 = ConvNormAct(rd_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer)
        self.dropout = nn.Dropout2d(p=drop_rate)

    def forward(self, x):
--- a/timm/models/layers/norm_act.py
+++ b/timm/models/layers/norm_act.py
@ -1,5 +1,7 @@
 """ Normalization + Activation Layers
 """
+from typing import Union, List
+
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@ -14,12 +16,13 @@ class BatchNormAct2d(nn.BatchNorm2d):
    compatible with weights trained with separate bn, act. This is why we inherit from BN
    instead of composing it as a .bn member.
    """
-    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
-                 apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
+    def __init__(
+            self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
+            apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None):
        super(BatchNormAct2d, self).__init__(
            num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
-        if isinstance(act_layer, str):
-            act_layer = get_act_layer(act_layer)
+        self.drop = drop_layer() if drop_layer is not None else nn.Identity()
+        act_layer = get_act_layer(act_layer)  # string -> nn.Module
        if act_layer is not None and apply_act:
            act_args = dict(inplace=True) if inplace else {}
            self.act = act_layer(**act_args)
@ -29,8 +32,8 @@ class BatchNormAct2d(nn.BatchNorm2d):
    def _forward_jit(self, x):
        """ A cut & paste of the contents of the PyTorch BatchNorm2d forward function
        """
-        # exponential_average_factor is self.momentum set to
-        # (when it is available) only so that if gets updated
+        # exponential_average_factor is set to self.momentum
+        # (when it is available) only so that it gets updated
        # in ONNX graph when this node is exported to ONNX.
        if self.momentum is None:
            exponential_average_factor = 0.0
@ -39,18 +42,38 @@ class BatchNormAct2d(nn.BatchNorm2d):

        if self.training and self.track_running_stats:
            # TODO: if statement only here to tell the jit to skip emitting this when it is None
-            if self.num_batches_tracked is not None:
-                self.num_batches_tracked += 1
+            if self.num_batches_tracked is not None:  # type: ignore[has-type]
+                self.num_batches_tracked = self.num_batches_tracked + 1  # type: ignore[has-type]
                if self.momentum is None:  # use cumulative moving average
                    exponential_average_factor = 1.0 / float(self.num_batches_tracked)
                else:  # use exponential moving average
                    exponential_average_factor = self.momentum

-        x = F.batch_norm(
-                x, self.running_mean, self.running_var, self.weight, self.bias,
-                self.training or not self.track_running_stats,
-                exponential_average_factor, self.eps)
-        return x
+        r"""
+        Decide whether the mini-batch stats should be used for normalization rather than the buffers.
+        Mini-batch stats are used in training mode, and in eval mode when buffers are None.
+        """
+        if self.training:
+            bn_training = True
+        else:
+            bn_training = (self.running_mean is None) and (self.running_var is None)
+
+        r"""
+        Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
+        passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
+        used for normalization (i.e. in eval mode when buffers are not None).
+        """
+        return F.batch_norm(
+            x,
+            # If buffers are not to be tracked, ensure that they won't be updated
+            self.running_mean if not self.training or self.track_running_stats else None,
+            self.running_var if not self.training or self.track_running_stats else None,
+            self.weight,
+            self.bias,
+            bn_training,
+            exponential_average_factor,
+            self.eps,
+        )

    @torch.jit.ignore
    def _forward_python(self, x):
@ -62,17 +85,27 @@ class BatchNormAct2d(nn.BatchNorm2d):
            x = self._forward_jit(x)
        else:
            x = self._forward_python(x)
+        x = self.drop(x)
        x = self.act(x)
        return x


+def _num_groups(num_channels, num_groups, group_size):
+    """Resolve a group count, letting a truthy `group_size` (channels per group) override `num_groups`."""
+    if not group_size:
+        # No per-group channel count requested, the explicit group count is used unchanged.
+        return num_groups
+    groups, leftover = divmod(num_channels, group_size)
+    assert leftover == 0
+    return groups
+
+
 class GroupNormAct(nn.GroupNorm):
    # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args
-    def __init__(self, num_channels, num_groups=32, eps=1e-5, affine=True,
-                 apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
-        super(GroupNormAct, self).__init__(num_groups, num_channels, eps=eps, affine=affine)
-        if isinstance(act_layer, str):
-            act_layer = get_act_layer(act_layer)
+    def __init__(
+            self, num_channels, num_groups=32, eps=1e-5, affine=True, group_size=None,
+            apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None):
+        super(GroupNormAct, self).__init__(
+            _num_groups(num_channels, num_groups, group_size), num_channels, eps=eps, affine=affine)
+        self.drop = drop_layer() if drop_layer is not None else nn.Identity()
+        act_layer = get_act_layer(act_layer)  # string -> nn.Module
        if act_layer is not None and apply_act:
            act_args = dict(inplace=True) if inplace else {}
            self.act = act_layer(**act_args)
@ -81,5 +114,47 @@ class GroupNormAct(nn.GroupNorm):

    def forward(self, x):
        x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps)
+        x = self.drop(x)
+        x = self.act(x)
+        return x
+
+
+class LayerNormAct(nn.LayerNorm):
+    """ LayerNorm followed by an optional drop layer and an optional activation.
+    """
+    def __init__(
+            self, normalization_shape: Union[int, List[int], torch.Size], eps=1e-5, affine=True,
+            apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None):
+        super().__init__(normalization_shape, eps=eps, elementwise_affine=affine)
+        self.drop = nn.Identity() if drop_layer is None else drop_layer()
+        act_layer = get_act_layer(act_layer)  # string -> nn.Module
+        if apply_act and act_layer is not None:
+            # inplace is only forwarded when requested, the activation may not accept the kwarg otherwise
+            self.act = act_layer(inplace=True) if inplace else act_layer()
+        else:
+            self.act = nn.Identity()
+
+    def forward(self, x):
+        normed = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        return self.act(self.drop(normed))
+
+
+class LayerNormAct2d(nn.LayerNorm):
+    """ LayerNorm over dim 1 of a 4-D input, followed by an optional drop layer and activation.
+
+    The input is permuted so that dim 1 becomes the last dim, normalized over `num_channels`, and permuted
+    back, so the output keeps the input's dim order (presumably NCHW -- confirm against callers).
+    """
+    def __init__(
+            self, num_channels, eps=1e-5, affine=True,
+            apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None):
+        super(LayerNormAct2d, self).__init__(num_channels, eps=eps, elementwise_affine=affine)
+        # drop_layer is a factory called with no arguments; absent means a no-op
+        self.drop = drop_layer() if drop_layer is not None else nn.Identity()
+        act_layer = get_act_layer(act_layer)  # string -> nn.Module
+        if act_layer is not None and apply_act:
+            # inplace is only forwarded when requested
+            act_args = dict(inplace=True) if inplace else {}
+            self.act = act_layer(**act_args)
+        else:
+            self.act = nn.Identity()
+
+    def forward(self, x):
+        # move dim 1 last for F.layer_norm, then restore the original dim order
+        x = F.layer_norm(
+            x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2)
+        x = self.drop(x)
         x = self.act(x)
         return x
--- a/timm/models/layers/pooled_attn.py
+++ b/timm/models/layers/pooled_attn.py
@ -0,0 +1,143 @@
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .helpers import to_2tuple
+from .weight_init import trunc_normal_
+
+
+def rel_logits_1d(q, rel_k, permute_mask: List[int]):
+    """ Compute relative logits along one dimension
+
+    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
+    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
+
+    Args:
+        q: (batch, height, width, dim). Exactly four dims are unpacked below, so any head dim must already
+            be folded into batch by the caller.
+        rel_k: (2 * width - 1, dim)
+        permute_mask: permute output dim according to this
+
+    Returns:
+        The (batch, height, height, width, width) logits with their dims reordered by `permute_mask`.
+    """
+    B, H, W, dim = q.shape
+    x = (q @ rel_k.transpose(-1, -2))
+    x = x.reshape(-1, W, 2 * W -1)
+
+    # pad to shift from relative to absolute indexing
+    # each row grows to 2W; flattened that is 2W*W per sample, and W - 1 more makes (W + 1) * (2W - 1)
+    x_pad = F.pad(x, [0, 1]).flatten(1)
+    x_pad = F.pad(x_pad, [0, W - 1])
+
+    # reshape and slice out the padded elements
+    x_pad = x_pad.reshape(-1, W + 1, 2 * W - 1)
+    x = x_pad[:, :W, W - 1:]
+
+    # reshape and tile
+    # the (W, W) logits are repeated (as an expand view) along a new dim of size H
+    x = x.reshape(B, H, 1, W, W).expand(-1, -1, H, -1, -1)
+    return x.permute(permute_mask)
+
+
+class PosEmbedRel(nn.Module):
+    """ Relative Position Embedding
+
+    Holds one learned table per spatial axis and turns a query tensor into additive attention logits.
+
+    As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2
+    Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925
+    """
+    def __init__(self, feat_size, dim_head, scale):
+        super().__init__()
+        height, width = to_2tuple(feat_size)
+        self.height, self.width = height, width
+        self.dim_head = dim_head
+        self.scale = scale
+        # the height table is drawn before the width table, keep that order so seeded init is unchanged
+        self.height_rel = nn.Parameter(torch.randn(2 * height - 1, dim_head) * scale)
+        self.width_rel = nn.Parameter(torch.randn(2 * width - 1, dim_head) * scale)
+
+    def forward(self, q):
+        B, num_heads, HW, _ = q.shape
+
+        # fold heads into batch and unflatten the token dim into (height, width)
+        q_hw = q.reshape(B * num_heads, self.height, self.width, -1)
+
+        # relative logits in width dimension.
+        logits_w = rel_logits_1d(q_hw, self.width_rel, permute_mask=(0, 1, 3, 2, 4))
+
+        # relative logits in height dimension, computed on the spatially transposed query.
+        q_wh = q_hw.transpose(1, 2)
+        logits_h = rel_logits_1d(q_wh, self.height_rel, permute_mask=(0, 3, 1, 4, 2))
+
+        return (logits_h + logits_w).reshape(B, num_heads, HW, HW)
+
+
+class BottleneckAttn(nn.Module):
+    """ Bottleneck Attention
+    Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605
+
+    Multi-head self-attention over all positions of a fixed-size 2D feature map, with relative position
+    logits added to the content logits, optionally followed by 2x2 average pooling.
+
+    Args:
+        dim: input channels
+        dim_out: output channels, defaults to `dim`; must be divisible by `num_heads`
+        feat_size: expected input (H, W), required because the position tables are sized from it
+        stride: output is average pooled 2x2 when this is 2, otherwise left unpooled
+        num_heads: number of attention heads
+        qkv_bias: add a bias to the qkv projection
+    """
+    def __init__(self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, qkv_bias=False):
+        super().__init__()
+        assert feat_size is not None, 'A concrete feature size matching expected input (H, W) is required'
+        dim_out = dim_out or dim
+        assert dim_out % num_heads == 0
+        self.num_heads = num_heads
+        self.dim_out = dim_out
+        self.dim_head = dim_out // num_heads
+        self.scale = self.dim_head ** -0.5
+
+        # a single 1x1 conv produces q, k and v stacked along the channel dim
+        self.qkv = nn.Conv2d(dim, self.dim_out * 3, 1, bias=qkv_bias)
+
+        # NOTE I'm only supporting relative pos embedding for now
+        self.pos_embed = PosEmbedRel(feat_size, dim_head=self.dim_head, scale=self.scale)
+
+        self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity()
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        # re-initializes the qkv weight and both relative position tables with truncated normals
+        trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5)
+        trunc_normal_(self.pos_embed.height_rel, std=self.scale)
+        trunc_normal_(self.pos_embed.width_rel, std=self.scale)
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        # the position tables were sized at construction, so the input must match feat_size exactly
+        assert H == self.pos_embed.height
+        assert W == self.pos_embed.width
+
+        x = self.qkv(x)  # B, 3 * num_heads * dim_head, H, W
+        x = x.reshape(B, -1, self.dim_head, H * W).transpose(-1, -2)
+        # dim 1 holds 3 * num_heads entries; chunks of num_heads give q, k, v in that order
+        q, k, v = torch.split(x, self.num_heads, dim=1)
+
+        attn_logits = (q @ k.transpose(-1, -2)) * self.scale
+        attn_logits = attn_logits + self.pos_embed(q)  # B, num_heads, H * W, H * W
+
+        attn_out = attn_logits.softmax(dim=-1)
+        attn_out = (attn_out @ v).transpose(-1, -2).reshape(B, self.dim_out, H, W)  # B, dim_out, H, W
+        attn_out = self.pool(attn_out)
+        return attn_out
+
+
+class PoolingAttention(nn.Module):
+    """ Sum of three pooled token mixers computed from one shared linear projection.
+
+    The projection is split into five equally sized groups (query, key, value, segment, local) and the
+    output is the sum of:
+      * a global term: the values averaged with softmax weights obtained by scoring the mean query
+        against every key, then gated elementwise by each position's query
+      * a segment term: the max over each block of `segments` consecutive positions, gated by the query
+      * a local term: a stride 1 sliding window max of width `max_pool_kernel`
+
+    Args:
+        in_features: size of the last input dim
+        attention_features: size of the last output dim
+        segments: number of consecutive positions pooled together in the segment term, must divide the
+            sequence length
+        max_pool_kernel: window width of the local term, must be a positive odd number so that the
+            `kernel // 2` padding keeps the sequence length unchanged
+
+    Raises:
+        ValueError: if `max_pool_kernel` is not a positive odd number.
+    """
+    def __init__(self, in_features: int, attention_features: int, segments: int, max_pool_kernel: int):
+        super(PoolingAttention, self).__init__()
+        if max_pool_kernel < 1 or max_pool_kernel % 2 == 0:
+            # an even window with kernel // 2 padding yields sequence + 1 outputs, which cannot be summed
+            # with the other two terms
+            raise ValueError(f'max_pool_kernel must be a positive odd number, got {max_pool_kernel}')
+        self.attn = nn.Linear(in_features, attention_features * 5)
+        self.segments = segments
+        self.max_pool_kernel = max_pool_kernel
+
+    def forward(self, inp: torch.Tensor):  # Shape: [Batch, Sequence, Features]
+        batch, sequence, _ = inp.size()
+        assert sequence % self.segments == 0
+
+        qry, key, val, seg, loc = self.attn(inp).chunk(5, 2)  # 5x Shape: [Batch, Sequence, AttentionFeatures]
+
+        # the mean query has to be 2-D to match the "ba" einsum subscripts, so the sequence dim is dropped
+        aggregated = qry.mean(1)  # Shape: [Batch, AttentionFeatures]
+        aggregated = torch.einsum("ba,bsa->bs", aggregated, key)  # Shape: [Batch, Sequence]
+        aggregated = F.softmax(aggregated, 1)
+        aggregated = torch.einsum("bs,bsa,bza->bza", aggregated, val,
+                                  qry)  # Shape: [Batch, Sequence, AttentionFeatures]
+
+        pooled_sequence = sequence // self.segments
+        segment_max_pooled = seg.view(batch, pooled_sequence, self.segments, -1)
+        # Tensor.max(dim) returns (values, indices), only the values take part in the product
+        segment_max_pooled = segment_max_pooled.max(2, keepdim=True).values  # Shape: [Batch, PooledSequence, 1, AttentionFeatures]
+        segment_max_pooled = segment_max_pooled * qry.view(batch, pooled_sequence, self.segments, -1)  # Shape: [Batch, PooledSequence, PoolSize, AttentionFeatures]
+        segment_max_pooled = segment_max_pooled.view(batch, sequence, -1)  # Shape: [Batch, Sequence, AttentionFeatures]
+
+        loc = loc.transpose(1, 2)  # Shape: [Batch, AttentionFeatures, Sequence]
+        local_max_pooled = F.max_pool1d(loc, self.max_pool_kernel, 1, self.max_pool_kernel // 2)
+        local_max_pooled = local_max_pooled.transpose(1, 2)  # Shape: [Batch, Sequence, AttentionFeatures]
+
+        return aggregated + segment_max_pooled + local_max_pooled
--- a/timm/models/layers/selective_kernel.py
+++ b/timm/models/layers/selective_kernel.py
@ -7,7 +7,7 @@ Hacked together by / Copyright 2020 Ross Wightman
 import torch
 from torch import nn as nn

-from .conv_bn_act import ConvBnAct
+from .conv_bn_act import ConvNormActAa
 from .helpers import make_divisible
 from .trace_utils import _assert

@ -20,8 +20,7 @@ def _kernel_valid(k):


 class SelectiveKernelAttn(nn.Module):
-    def __init__(self, channels, num_paths=2, attn_channels=32,
-                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+    def __init__(self, channels, num_paths=2, attn_channels=32, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        """ Selective Kernel Attention Module

        Selective Kernel attention mechanism factored out into its own module.
@ -51,7 +50,7 @@ class SelectiveKernel(nn.Module):

    def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1,
                 rd_ratio=1./16, rd_channels=None, rd_divisor=8, keep_3x3=True, split_input=True,
-                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None):
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_layer=None):
        """ Selective Kernel Convolution Module

        As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications.
@ -72,9 +71,10 @@ class SelectiveKernel(nn.Module):
            keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations
            split_input (bool): split input channels evenly across each convolution branch, keeps param count lower,
                can be viewed as grouping by path, output expands to module out_channels count
-            drop_block (nn.Module): drop block module
            act_layer (nn.Module): activation layer to use
            norm_layer (nn.Module): batchnorm/norm layer to use
+            aa_layer (nn.Module): anti-aliasing module
+            drop_layer (nn.Module): spatial drop module in convs (drop block, etc)
        """
        super(SelectiveKernel, self).__init__()
        out_channels = out_channels or in_channels
@ -97,15 +97,14 @@ class SelectiveKernel(nn.Module):
        groups = min(out_channels, groups)

        conv_kwargs = dict(
-            stride=stride, groups=groups, drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer,
-            aa_layer=aa_layer)
+            stride=stride, groups=groups, act_layer=act_layer, norm_layer=norm_layer,
+            aa_layer=aa_layer, drop_layer=drop_layer)
        self.paths = nn.ModuleList([
-            ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs)
+            ConvNormActAa(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs)
            for k, d in zip(kernel_size, dilation)])

        attn_channels = rd_channels or make_divisible(out_channels * rd_ratio, divisor=rd_divisor)
        self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels)
-        self.drop_block = drop_block

    def forward(self, x):
        if self.split_input:
--- a/timm/models/layers/separable_conv.py
+++ b/timm/models/layers/separable_conv.py
@ -8,16 +8,16 @@ Hacked together by / Copyright 2020 Ross Wightman
 from torch import nn as nn

 from .create_conv2d import create_conv2d
-from .create_norm_act import convert_norm_act
+from .create_norm_act import get_norm_act_layer


-class SeparableConvBnAct(nn.Module):
+class SeparableConvNormAct(nn.Module):
    """ Separable Conv w/ trailing Norm and Activation
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
                 channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU,
-                 apply_act=True, drop_block=None):
-        super(SeparableConvBnAct, self).__init__()
+                 apply_act=True, drop_layer=None):
+        super(SeparableConvNormAct, self).__init__()

        self.conv_dw = create_conv2d(
            in_channels, int(in_channels * channel_multiplier), kernel_size,
@ -26,8 +26,9 @@ class SeparableConvBnAct(nn.Module):
        self.conv_pw = create_conv2d(
            int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias)

-        norm_act_layer = convert_norm_act(norm_layer, act_layer)
-        self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block)
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
+        norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {}
+        self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs)

    @property
    def in_channels(self):
@ -40,11 +41,13 @@ class SeparableConvBnAct(nn.Module):
    def forward(self, x):
        x = self.conv_dw(x)
        x = self.conv_pw(x)
-        if self.bn is not None:
-            x = self.bn(x)
+        x = self.bn(x)
        return x


+SeparableConvBnAct = SeparableConvNormAct
+
+
 class SeparableConv2d(nn.Module):
    """ Separable Conv
    """
--- a/timm/models/layers/split_attn.py
+++ b/timm/models/layers/split_attn.py
@ -35,11 +35,10 @@ class SplitAttn(nn.Module):
    """
    def __init__(self, in_channels, out_channels=None, kernel_size=3, stride=1, padding=None,
                 dilation=1, groups=1, bias=False, radix=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8,
-                 act_layer=nn.ReLU, norm_layer=None, drop_block=None, **kwargs):
+                 act_layer=nn.ReLU, norm_layer=None, drop_layer=None, **kwargs):
        super(SplitAttn, self).__init__()
        out_channels = out_channels or in_channels
        self.radix = radix
-        self.drop_block = drop_block
        mid_chs = out_channels * radix
        if rd_channels is None:
            attn_chs = make_divisible(in_channels * radix * rd_ratio, min_value=32, divisor=rd_divisor)
@ -51,6 +50,7 @@ class SplitAttn(nn.Module):
            in_channels, mid_chs, kernel_size, stride, padding, dilation,
            groups=groups * radix, bias=bias, **kwargs)
        self.bn0 = norm_layer(mid_chs) if norm_layer else nn.Identity()
+        self.drop = drop_layer() if drop_layer is not None else nn.Identity()
        self.act0 = act_layer(inplace=True)
        self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups)
        self.bn1 = norm_layer(attn_chs) if norm_layer else nn.Identity()
@ -61,8 +61,7 @@ class SplitAttn(nn.Module):
    def forward(self, x):
        x = self.conv(x)
        x = self.bn0(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)
+        x = self.drop(x)
        x = self.act0(x)

        B, RC, H, W = x.shape
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@ -20,7 +20,7 @@ from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficien
    round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
 from .features import FeatureInfo, FeatureHooks
 from .helpers import build_model_with_cfg, default_cfg_for_features
-from .layers import SelectAdaptivePool2d, Linear, create_conv2d, get_act_fn, hard_sigmoid
+from .layers import SelectAdaptivePool2d, Linear, create_conv2d, get_act_fn, get_norm_act_layer
 from .registry import register_model

 __all__ = ['MobileNetV3', 'MobileNetV3Features']
@ -95,6 +95,7 @@ class MobileNetV3(nn.Module):
        super(MobileNetV3, self).__init__()
        act_layer = act_layer or nn.ReLU
        norm_layer = norm_layer or nn.BatchNorm2d
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer)
        se_layer = se_layer or SqueezeExcite
        self.num_classes = num_classes
        self.num_features = num_features
@ -103,8 +104,7 @@ class MobileNetV3(nn.Module):
        # Stem
        stem_size = round_chs_fn(stem_size)
        self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type)
-        self.bn1 = norm_layer(stem_size)
-        self.act1 = act_layer(inplace=True)
+        self.bn1 = norm_act_layer(stem_size, inplace=True)

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
@ -125,7 +125,7 @@ class MobileNetV3(nn.Module):
        efficientnet_init_weights(self)

    def as_sequential(self):
-        layers = [self.conv_stem, self.bn1, self.act1]
+        layers = [self.conv_stem, self.bn1]
        layers.extend(self.blocks)
        layers.extend([self.global_pool, self.conv_head, self.act2])
        layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier])
@ -144,7 +144,6 @@ class MobileNetV3(nn.Module):
    def forward_features(self, x):
        x = self.conv_stem(x)
        x = self.bn1(x)
-        x = self.act1(x)
        x = self.blocks(x)
        x = self.global_pool(x)
        x = self.conv_head(x)
--- a/timm/models/nasnet.py
+++ b/timm/models/nasnet.py
@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.nn.functional as F

 from .helpers import build_model_with_cfg
-from .layers import ConvBnAct, create_conv2d, create_pool2d, create_classifier
+from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier
 from .registry import register_model

 __all__ = ['NASNetALarge']
@ -420,7 +420,7 @@ class NASNetALarge(nn.Module):
        channels = self.num_features // 24
        # 24 is default value for the architecture

-        self.conv0 = ConvBnAct(
+        self.conv0 = ConvNormAct(
            in_channels=in_chans, out_channels=self.stem_size, kernel_size=3, padding=0, stride=2,
            norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False)

--- a/timm/models/pnasnet.py
+++ b/timm/models/pnasnet.py
@ -13,7 +13,7 @@ import torch.nn as nn
 import torch.nn.functional as F

 from .helpers import build_model_with_cfg
-from .layers import ConvBnAct, create_conv2d, create_pool2d, create_classifier
+from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier
 from .registry import register_model

 __all__ = ['PNASNet5Large']
@ -243,7 +243,7 @@ class PNASNet5Large(nn.Module):
        self.num_features = 4320
        assert output_stride == 32

-        self.conv_0 = ConvBnAct(
+        self.conv_0 = ConvNormAct(
            in_chans, 96, kernel_size=3, stride=2, padding=0,
            norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False)

--- a/timm/models/regnet.py
+++ b/timm/models/regnet.py
@ -15,45 +15,76 @@ Hacked together by / Copyright 2020 Ross Wightman
 """
 import numpy as np
 import torch.nn as nn
+from dataclasses import dataclass
+from functools import partial
+from typing import Optional, Union, Callable

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, AvgPool2dSame, ConvBnAct, SEModule, DropPath
+from .helpers import build_model_with_cfg, named_apply
+from .layers import ClassifierHead, AvgPool2dSame, ConvNormAct, SEModule, DropPath, get_act_layer, GroupNormAct
 from .registry import register_model


-def _mcfg(**kwargs):
-    cfg = dict(se_ratio=0., bottle_ratio=1., stem_width=32)
-    cfg.update(**kwargs)
-    return cfg
+@dataclass
+class RegNetCfg:
+    """Configuration record for one RegNet variant.
+
+    The w0 / wa / wm / group_size / depth defaults equal the regnety_032 entry of `model_cfgs`; the
+    se_ratio default (0.) does not, that entry sets 0.25.
+    """
+    # depth, w0, wa, wm: presumably the network depth plus the initial width, width slope and width
+    # multiplier fed to the width generator -- confirm against the code that consumes this config
+    depth: int = 21
+    w0: int = 80
+    wa: float = 42.63
+    wm: float = 2.66
+    # replaces the `group_w` key of the previous dict based configs
+    group_size: int = 24
+    bottle_ratio: float = 1.
+    # 0. in the RegNet-X entries, 0.25 in the RegNet-Y and RegNet-Z entries
+    se_ratio: float = 0.
+    stem_width: int = 32
+    # the RegNet-Z entries set this to None
+    downsample: Optional[str] = 'conv1x1'
+    # only the RegNet-Z entries enable this
+    linear_out: bool = False
+    # both layer fields accept either a name string or a callable (e.g. a functools.partial of a layer class)
+    act_layer: Union[str, Callable] = 'relu'
+    norm_layer: Union[str, Callable] = 'batchnorm'


 # Model FLOPS = three trailing digits * 10^8
 model_cfgs = dict(
-    regnetx_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13),
-    regnetx_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22),
-    regnetx_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16),
-    regnetx_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16),
-    regnetx_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18),
-    regnetx_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25),
-    regnetx_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23),
-    regnetx_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17),
-    regnetx_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23),
-    regnetx_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19),
-    regnetx_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22),
-    regnetx_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23),
-    regnety_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25),
-    regnety_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25),
-    regnety_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25),
-    regnety_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25),
-    regnety_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25),
-    regnety_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25),
-    regnety_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25),
-    regnety_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25),
-    regnety_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25),
-    regnety_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25),
-    regnety_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25),
-    regnety_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25),
+    # RegNet-X
+    regnetx_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13),
+    regnetx_004=RegNetCfg(w0=24, wa=24.48, wm=2.54, group_size=16, depth=22),
+    regnetx_006=RegNetCfg(w0=48, wa=36.97, wm=2.24, group_size=24, depth=16),
+    regnetx_008=RegNetCfg(w0=56, wa=35.73, wm=2.28, group_size=16, depth=16),
+    regnetx_016=RegNetCfg(w0=80, wa=34.01, wm=2.25, group_size=24, depth=18),
+    regnetx_032=RegNetCfg(w0=88, wa=26.31, wm=2.25, group_size=48, depth=25),
+    regnetx_040=RegNetCfg(w0=96, wa=38.65, wm=2.43, group_size=40, depth=23),
+    regnetx_064=RegNetCfg(w0=184, wa=60.83, wm=2.07, group_size=56, depth=17),
+    regnetx_080=RegNetCfg(w0=80, wa=49.56, wm=2.88, group_size=120, depth=23),
+    regnetx_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19),
+    regnetx_160=RegNetCfg(w0=216, wa=55.59, wm=2.1, group_size=128, depth=22),
+    regnetx_320=RegNetCfg(w0=320, wa=69.86, wm=2.0, group_size=168, depth=23),
+
+    # RegNet-Y
+    regnety_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13, se_ratio=0.25),
+    regnety_004=RegNetCfg(w0=48, wa=27.89, wm=2.09, group_size=8, depth=16, se_ratio=0.25),
+    regnety_006=RegNetCfg(w0=48, wa=32.54, wm=2.32, group_size=16, depth=15, se_ratio=0.25),
+    regnety_008=RegNetCfg(w0=56, wa=38.84, wm=2.4, group_size=16, depth=14, se_ratio=0.25),
+    regnety_016=RegNetCfg(w0=48, wa=20.71, wm=2.65, group_size=24, depth=27, se_ratio=0.25),
+    regnety_032=RegNetCfg(w0=80, wa=42.63, wm=2.66, group_size=24, depth=21, se_ratio=0.25),
+    regnety_040=RegNetCfg(w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25),
+    regnety_064=RegNetCfg(w0=112, wa=33.22, wm=2.27, group_size=72, depth=25, se_ratio=0.25),
+    regnety_080=RegNetCfg(w0=192, wa=76.82, wm=2.19, group_size=56, depth=17, se_ratio=0.25),
+    regnety_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19, se_ratio=0.25),
+    regnety_160=RegNetCfg(w0=200, wa=106.23, wm=2.48, group_size=112, depth=18, se_ratio=0.25),
+    regnety_320=RegNetCfg(w0=232, wa=115.89, wm=2.53, group_size=232, depth=20, se_ratio=0.25),
+
+    # Experimental
+    regnety_040s_gn=RegNetCfg(
+        w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25,
+        act_layer='silu', norm_layer=partial(GroupNormAct, group_size=16)),
+
+    # RegNet-Z (unverified)
+    regnetz_005=RegNetCfg(
+        depth=21, w0=16, wa=10.7, wm=2.51, group_size=4, bottle_ratio=4.0, se_ratio=0.25,
+        downsample=None, linear_out=True, act_layer='silu',
+    ),
+    regnetz_040=RegNetCfg(
+        depth=28, w0=48, wa=14.5, wm=2.226, group_size=8, bottle_ratio=4.0, se_ratio=0.25,
+        downsample=None, linear_out=True, act_layer='silu',
+    ),
 )


@ -80,6 +111,7 @@ default_cfgs = dict(
    regnetx_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'),
    regnetx_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'),
    regnetx_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'),
+
    regnety_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'),
    regnety_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'),
    regnety_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'),
@ -96,6 +128,11 @@ default_cfgs = dict(
        url='https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth',  # from Facebook DeiT GitHub repository
        crop_pct=1.0, test_input_size=(3, 288, 288)),
    regnety_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'),
+
+    regnety_040s_gn=_cfg(url=''),
+
+    regnetz_005=_cfg(url=''),
+    regnetz_040=_cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
 )


@ -125,6 +162,40 @@ def generate_regnet(width_slope, width_initial, width_mult, depth, q=8):
    return widths, num_stages, max_stage, widths_cont


+def downsample_conv(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None):
+    """Build a conv + norm projection (activation disabled) for a block shortcut.
+
+    `norm_layer` falls back to `nn.BatchNorm2d` when not given. The kernel size is
+    forced to 1 when neither striding nor dilation is requested, and dilation is
+    reset to 1 whenever the resulting kernel size is not larger than 1.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+    if stride == 1 and dilation == 1:
+        # plain channel projection, a 1x1 kernel is sufficient
+        kernel_size = 1
+    if kernel_size <= 1:
+        # dilation is only kept for kernels larger than 1
+        dilation = 1
+    proj = ConvNormAct(
+        in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, norm_layer=norm_layer, apply_act=False)
+    return proj
+
+
+def downsample_avg(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None):
+    """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment.
+
+    Returns an `nn.Sequential` of a pooling stage followed by a 1x1 conv + norm (no activation).
+    The pooling stage is `nn.Identity` unless `stride > 1` or `dilation > 1`.
+    `kernel_size` is accepted but not used by this function.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+    pool = nn.Identity()
+    if stride > 1 or dilation > 1:
+        # when dilated, the pool itself runs at stride 1
+        avg_stride = stride if dilation == 1 else 1
+        if dilation > 1 and avg_stride == 1:
+            avg_pool_fn = AvgPool2dSame
+        else:
+            avg_pool_fn = nn.AvgPool2d
+        pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
+    proj = ConvNormAct(in_chs, out_chs, 1, stride=1, norm_layer=norm_layer, apply_act=False)
+    return nn.Sequential(pool, proj)
+
+
+def create_shortcut(downsample_type, in_chs, out_chs, kernel_size, stride, dilation=(1, 1), norm_layer=None):
+    """Select the shortcut module for a residual block.
+
+    Returns:
+        * `nn.Identity()` when channels match, stride is 1 and both dilation entries are equal.
+        * `None` when a projection would be needed but `downsample_type` is empty / None (no shortcut).
+        * the result of `downsample_avg(...)` for `'avg'`, otherwise of `downsample_conv(...)`.
+
+    Only `dilation[0]` is forwarded to the downsample helpers.
+    """
+    assert downsample_type in ('avg', 'conv1x1', '', None)
+    needs_projection = in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]
+    if not needs_projection:
+        return nn.Identity()  # identity shortcut (no downsample)
+    if not downsample_type:
+        return None  # no shortcut, no downsample
+    if downsample_type == 'avg':
+        return downsample_avg(in_chs, out_chs, stride=stride, dilation=dilation[0], norm_layer=norm_layer)
+    return downsample_conv(
+        in_chs, out_chs, kernel_size=kernel_size, stride=stride, dilation=dilation[0], norm_layer=norm_layer)
+
+
 class Bottleneck(nn.Module):
    """ RegNet Bottleneck

@ -132,97 +203,70 @@ class Bottleneck(nn.Module):
    after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels.
    """

-    def __init__(self, in_chs, out_chs, stride=1, dilation=1, bottleneck_ratio=1, group_width=1, se_ratio=0.25,
-                 downsample=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None,
-                 drop_block=None, drop_path=None):
+    def __init__(self, in_chs, out_chs, stride=1, dilation=(1, 1), bottle_ratio=1, group_size=1, se_ratio=0.25,
+                 downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+                 drop_block=None, drop_path_rate=0.):
        super(Bottleneck, self).__init__()
-        bottleneck_chs = int(round(out_chs * bottleneck_ratio))
-        groups = bottleneck_chs // group_width
-
-        cargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block)
-        self.conv1 = ConvBnAct(in_chs, bottleneck_chs, kernel_size=1, **cargs)
-        self.conv2 = ConvBnAct(
-            bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, dilation=dilation,
-            groups=groups, **cargs)
+        act_layer = get_act_layer(act_layer)
+        bottleneck_chs = int(round(out_chs * bottle_ratio))
+        groups = bottleneck_chs // group_size
+
+        cargs = dict(act_layer=act_layer, norm_layer=norm_layer)
+        self.conv1 = ConvNormAct(in_chs, bottleneck_chs, kernel_size=1, **cargs)
+        self.conv2 = ConvNormAct(
+            bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, dilation=dilation[0],
+            groups=groups, drop_layer=drop_block, **cargs)
        if se_ratio:
            se_channels = int(round(in_chs * se_ratio))
-            self.se = SEModule(bottleneck_chs, rd_channels=se_channels)
+            self.se = SEModule(bottleneck_chs, rd_channels=se_channels, act_layer=act_layer)
        else:
-            self.se = None
-        cargs['act_layer'] = None
-        self.conv3 = ConvBnAct(bottleneck_chs, out_chs, kernel_size=1, **cargs)
-        self.act3 = act_layer(inplace=True)
-        self.downsample = downsample
-        self.drop_path = drop_path
-
-    def zero_init_last_bn(self):
+            self.se = nn.Identity()
+        self.conv3 = ConvNormAct(bottleneck_chs, out_chs, kernel_size=1, apply_act=False, **cargs)
+        self.act3 = nn.Identity() if linear_out else act_layer()
+        self.downsample = create_shortcut(downsample, in_chs, out_chs, 1, stride, dilation, norm_layer=norm_layer)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
+
+    def zero_init_last(self):
        nn.init.zeros_(self.conv3.bn.weight)

    def forward(self, x):
        shortcut = x
        x = self.conv1(x)
        x = self.conv2(x)
-        if self.se is not None:
-            x = self.se(x)
+        x = self.se(x)
        x = self.conv3(x)
-        if self.drop_path is not None:
-            x = self.drop_path(x)
        if self.downsample is not None:
-            shortcut = self.downsample(shortcut)
-        x += shortcut
+            # NOTE stuck with downsample as the attr name due to weight compatibility
+            # now represents the shortcut, no shortcut if None, and non-downsample shortcut == nn.Identity()
+            x = x + self.drop_path(self.downsample(shortcut))
        x = self.act3(x)
        return x


-def downsample_conv(
-        in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None):
-    norm_layer = norm_layer or nn.BatchNorm2d
-    kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size
-    dilation = dilation if kernel_size > 1 else 1
-    return ConvBnAct(
-        in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, norm_layer=norm_layer, act_layer=None)
-
-
-def downsample_avg(
-        in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None):
-    """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment."""
-    norm_layer = norm_layer or nn.BatchNorm2d
-    avg_stride = stride if dilation == 1 else 1
-    pool = nn.Identity()
-    if stride > 1 or dilation > 1:
-        avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
-        pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
-    return nn.Sequential(*[
-        pool, ConvBnAct(in_chs, out_chs, 1, stride=1, norm_layer=norm_layer, act_layer=None)])
-
-
 class RegStage(nn.Module):
    """Stage (sequence of blocks w/ the same output shape)."""

-    def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, group_width,
-                 block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, drop_block=None):
+    def __init__(
+            self, depth, in_chs, out_chs, stride, dilation, bottle_ratio=1.0, group_size=8, block_fn=Bottleneck,
+            se_ratio=0., downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+            drop_path_rates=None, drop_block=None):
        super(RegStage, self).__init__()
-        block_kwargs = {}  # FIXME setup to pass various aa, norm, act layer common args
+        block_kwargs = dict(
+            bottle_ratio=bottle_ratio, group_size=group_size, se_ratio=se_ratio, downsample=downsample,
+            linear_out=linear_out, act_layer=act_layer, norm_layer=norm_layer, drop_block=drop_block)
        first_dilation = 1 if dilation in (1, 2) else 2
        for i in range(depth):
            block_stride = stride if i == 0 else 1
            block_in_chs = in_chs if i == 0 else out_chs
-            block_dilation = first_dilation if i == 0 else dilation
-            if drop_path_rates is not None and drop_path_rates[i] > 0.:
-                drop_path = DropPath(drop_path_rates[i])
-            else:
-                drop_path = None
-            if (block_in_chs != out_chs) or (block_stride != 1):
-                proj_block = downsample_conv(block_in_chs, out_chs, 1, block_stride, block_dilation)
-            else:
-                proj_block = None
-
+            block_dilation = (first_dilation, dilation)
+            dpr = drop_path_rates[i] if drop_path_rates is not None else 0.
            name = "b{}".format(i + 1)
            self.add_module(
                name, block_fn(
-                    block_in_chs, out_chs, block_stride, block_dilation, bottle_ratio, group_width, se_ratio,
-                    downsample=proj_block, drop_block=drop_block, drop_path=drop_path, **block_kwargs)
+                    block_in_chs, out_chs, stride=block_stride, dilation=block_dilation,
+                    drop_path_rate=dpr, **block_kwargs)
            )
+            first_dilation = dilation

    def forward(self, x):
        for block in self.children():
@ -231,33 +275,34 @@ class RegStage(nn.Module):


 class RegNet(nn.Module):
-    """RegNet model.
+    """RegNet-X, Y, and Z Models

    Paper: https://arxiv.org/abs/2003.13678
    Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py
    """

-    def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg', drop_rate=0.,
-                 drop_path_rate=0., zero_init_last_bn=True):
+    def __init__(
+            self, cfg: RegNetCfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg',
+            drop_rate=0., drop_path_rate=0., zero_init_last=True):
        super().__init__()
-        # TODO add drop block, drop path, anti-aliasing, custom bn/act args
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        assert output_stride in (8, 16, 32)

        # Construct the stem
-        stem_width = cfg['stem_width']
-        self.stem = ConvBnAct(in_chans, stem_width, 3, stride=2)
+        stem_width = cfg.stem_width
+        self.stem = ConvNormAct(in_chans, stem_width, 3, stride=2, act_layer=cfg.act_layer, norm_layer=cfg.norm_layer)
        self.feature_info = [dict(num_chs=stem_width, reduction=2, module='stem')]

        # Construct the stages
        prev_width = stem_width
        curr_stride = 2
        stage_params = self._get_stage_params(cfg, output_stride=output_stride, drop_path_rate=drop_path_rate)
-        se_ratio = cfg['se_ratio']
        for i, stage_args in enumerate(stage_params):
            stage_name = "s{}".format(i + 1)
-            self.add_module(stage_name, RegStage(prev_width, **stage_args, se_ratio=se_ratio))
+            self.add_module(stage_name, RegStage(
+                in_chs=prev_width, se_ratio=cfg.se_ratio, downsample=cfg.downsample, linear_out=cfg.linear_out,
+                act_layer=cfg.act_layer, norm_layer=cfg.norm_layer, **stage_args))
            prev_width = stage_args['out_chs']
            curr_stride *= stage_args['stride']
            self.feature_info += [dict(num_chs=prev_width, reduction=curr_stride, module=stage_name)]
@ -267,31 +312,18 @@ class RegNet(nn.Module):
        self.head = ClassifierHead(
            in_chs=prev_width, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)

-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-            elif isinstance(m, nn.BatchNorm2d):
-                nn.init.ones_(m.weight)
-                nn.init.zeros_(m.bias)
-            elif isinstance(m, nn.Linear):
-                nn.init.normal_(m.weight, mean=0.0, std=0.01)
-                nn.init.zeros_(m.bias)
-        if zero_init_last_bn:
-            for m in self.modules():
-                if hasattr(m, 'zero_init_last_bn'):
-                    m.zero_init_last_bn()
-
-    def _get_stage_params(self, cfg, default_stride=2, output_stride=32, drop_path_rate=0.):
+        named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
+
+    def _get_stage_params(self, cfg: RegNetCfg, default_stride=2, output_stride=32, drop_path_rate=0.):
        # Generate RegNet ws per block
-        w_a, w_0, w_m, d = cfg['wa'], cfg['w0'], cfg['wm'], cfg['depth']
-        widths, num_stages, _, _ = generate_regnet(w_a, w_0, w_m, d)
+        widths, num_stages, _, _ = generate_regnet(cfg.wa, cfg.w0, cfg.wm, cfg.depth)

        # Convert to per stage format
        stage_widths, stage_depths = np.unique(widths, return_counts=True)

        # Use the same group width, bottleneck mult and stride for each stage
-        stage_groups = [cfg['group_w'] for _ in range(num_stages)]
-        stage_bottle_ratios = [cfg['bottle_ratio'] for _ in range(num_stages)]
+        stage_groups = [cfg.group_size for _ in range(num_stages)]
+        stage_bottle_ratios = [cfg.bottle_ratio for _ in range(num_stages)]
        stage_strides = []
        stage_dilations = []
        net_stride = 2
@ -305,11 +337,11 @@ class RegNet(nn.Module):
                net_stride *= stride
            stage_strides.append(stride)
            stage_dilations.append(dilation)
-        stage_dpr = np.split(np.linspace(0, drop_path_rate, d), np.cumsum(stage_depths[:-1]))
+        stage_dpr = np.split(np.linspace(0, drop_path_rate, cfg.depth), np.cumsum(stage_depths[:-1]))

        # Adjust the compatibility of ws and gws
        stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths, stage_bottle_ratios, stage_groups)
-        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rates']
+        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_size', 'drop_path_rates']
        stage_params = [
            dict(zip(param_names, params)) for params in
            zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_bottle_ratios, stage_groups,
@ -333,6 +365,19 @@ class RegNet(nn.Module):
        return x


+def _init_weights(module, name='', zero_init_last=False):
+    """Initialize one module in place; written to be mapped over a model via `named_apply`.
+
+    * `nn.Conv2d`: Kaiming-normal weights (`mode='fan_out'`, `nonlinearity='relu'`).
+    * `nn.BatchNorm2d`: weight set to ones, bias to zeros.
+    * `nn.Linear`: weights drawn from N(0, 0.01), bias set to zeros.
+    * any other module exposing `zero_init_last()`: that hook is called, but only
+      when `zero_init_last` is True.
+
+    Args:
+        module: the module to initialize.
+        name: module name; accepted but not used here.
+        zero_init_last: enable the per-block `zero_init_last()` hook.
+    """
+    if isinstance(module, nn.Conv2d):
+        nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
+    elif isinstance(module, nn.BatchNorm2d):
+        nn.init.ones_(module.weight)
+        nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.Linear):
+        nn.init.normal_(module.weight, mean=0.0, std=0.01)
+        nn.init.zeros_(module.bias)
+    elif zero_init_last and hasattr(module, 'zero_init_last'):
+        # honour the flag: previously the hook ran even when zero_init_last=False
+        module.zero_init_last()
+
+
 def _filter_fn(state_dict):
    """ convert patch embedding weight from manual patchify + linear proj to conv"""
    if 'model' in state_dict:
@ -492,3 +537,27 @@ def regnety_160(pretrained=False, **kwargs):
 def regnety_320(pretrained=False, **kwargs):
    """RegNetY-32GF"""
    return _create_regnet('regnety_320', pretrained, **kwargs)
+
+
+@register_model
+def regnety_040s_gn(pretrained=False, **kwargs):
+    """RegNetY-4.0GF w/ GroupNorm (experimental config using SiLU activation and GroupNormAct)."""
+    variant = 'regnety_040s_gn'
+    model = _create_regnet(variant, pretrained, **kwargs)
+    return model
+
+
+@register_model
+def regnetz_005(pretrained=False, **kwargs):
+    """RegNetZ-500MF
+
+    NOTE: the config comes from
+    https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py
+    and may not be equivalent to the paper model, which the paper does not detail.
+    """
+    variant = 'regnetz_005'
+    model = _create_regnet(variant, pretrained, **kwargs)
+    return model
+
+
+@register_model
+def regnetz_040(pretrained=False, **kwargs):
+    """RegNetZ-4.0GF
+
+    NOTE: the config comes from
+    https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py
+    and may not be equivalent to the paper model, which the paper does not detail.
+    """
+    variant = 'regnetz_040'
+    model = _create_regnet(variant, pretrained, **kwargs)
+    return model
--- a/timm/models/resnest.py
+++ b/timm/models/resnest.py
@ -75,7 +75,6 @@ class ResNestBottleneck(nn.Module):
        else:
            avd_stride = 0
        self.radix = radix
-        self.drop_block = drop_block

        self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False)
        self.bn1 = norm_layer(group_width)
@ -85,14 +84,16 @@ class ResNestBottleneck(nn.Module):
        if self.radix >= 1:
            self.conv2 = SplitAttn(
                group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation,
-                dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_block=drop_block)
+                dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_layer=drop_block)
            self.bn2 = nn.Identity()
+            self.drop_block = nn.Identity()
            self.act2 = nn.Identity()
        else:
            self.conv2 = nn.Conv2d(
                group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation,
                dilation=first_dilation, groups=cardinality, bias=False)
            self.bn2 = norm_layer(group_width)
+            self.drop_block = drop_block() if drop_block is not None else nn.Identity()
            self.act2 = act_layer(inplace=True)
        self.avd_last = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and not avd_first else None

@ -109,8 +110,6 @@ class ResNestBottleneck(nn.Module):

        out = self.conv1(x)
        out = self.bn1(out)
-        if self.drop_block is not None:
-            out = self.drop_block(out)
        out = self.act1(out)

        if self.avd_first is not None:
@ -118,8 +117,7 @@ class ResNestBottleneck(nn.Module):

        out = self.conv2(out)
        out = self.bn2(out)
-        if self.drop_block is not None:
-            out = self.drop_block(out)
+        out = self.drop_block(out)
        out = self.act2(out)

        if self.avd_last is not None:
@ -127,8 +125,6 @@ class ResNestBottleneck(nn.Module):

        out = self.conv3(out)
        out = self.bn3(out)
-        if self.drop_block is not None:
-            out = self.drop_block(out)

        if self.downsample is not None:
            shortcut = self.downsample(x)
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@ -307,8 +307,9 @@ class BasicBlock(nn.Module):
            inplanes, first_planes, kernel_size=3, stride=1 if use_aa else stride, padding=first_dilation,
            dilation=first_dilation, bias=False)
        self.bn1 = norm_layer(first_planes)
+        self.drop_block = drop_block() if drop_block is not None else nn.Identity()
        self.act1 = act_layer(inplace=True)
-        self.aa = aa_layer(channels=first_planes, stride=stride) if use_aa else None
+        self.aa = aa_layer(channels=first_planes, stride=stride) if use_aa else nn.Identity()

        self.conv2 = nn.Conv2d(
            first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False)
@ -320,7 +321,6 @@ class BasicBlock(nn.Module):
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
-        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
@ -331,16 +331,12 @@ class BasicBlock(nn.Module):

        x = self.conv1(x)
        x = self.bn1(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)
+        x = self.drop_block(x)
        x = self.act1(x)
-        if self.aa is not None:
-            x = self.aa(x)
+        x = self.aa(x)

        x = self.conv2(x)
        x = self.bn2(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)

        if self.se is not None:
            x = self.se(x)
@ -378,8 +374,9 @@ class Bottleneck(nn.Module):
            first_planes, width, kernel_size=3, stride=1 if use_aa else stride,
            padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False)
        self.bn2 = norm_layer(width)
+        self.drop_block = drop_block() if drop_block is not None else nn.Identity()
        self.act2 = act_layer(inplace=True)
-        self.aa = aa_layer(channels=width, stride=stride) if use_aa else None
+        self.aa = aa_layer(channels=width, stride=stride) if use_aa else nn.Identity()

        self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
@ -390,7 +387,6 @@ class Bottleneck(nn.Module):
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
-        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
@ -401,22 +397,16 @@ class Bottleneck(nn.Module):

        x = self.conv1(x)
        x = self.bn1(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)
        x = self.act1(x)

        x = self.conv2(x)
        x = self.bn2(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)
+        x = self.drop_block(x)
        x = self.act2(x)
-        if self.aa is not None:
-            x = self.aa(x)
+        x = self.aa(x)

        x = self.conv3(x)
        x = self.bn3(x)
-        if self.drop_block is not None:
-            x = self.drop_block(x)

        if self.se is not None:
            x = self.se(x)
@ -463,11 +453,11 @@ def downsample_avg(
    ])


-def drop_blocks(drop_block_rate=0.):
+def drop_blocks(drop_prob=0.):
    return [
        None, None,
-        DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None,
-        DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None]
+        partial(DropBlock2d, drop_prob=drop_prob, block_size=5, gamma_scale=0.25) if drop_prob else None,
+        partial(DropBlock2d, drop_prob=drop_prob, block_size=3, gamma_scale=1.00) if drop_prob else None]


 def make_blocks(
--- a/timm/models/rexnet.py
+++ b/timm/models/rexnet.py
@ -17,7 +17,7 @@ from math import ceil

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath, make_divisible, SEModule
+from .layers import ClassifierHead, create_act_layer, ConvNormAct, DropPath, make_divisible, SEModule
 from .registry import register_model
 from .efficientnet_builder import efficientnet_init_weights

@ -63,19 +63,19 @@ class LinearBottleneck(nn.Module):

        if exp_ratio != 1.:
            dw_chs = make_divisible(round(in_chs * exp_ratio), divisor=ch_div)
-            self.conv_exp = ConvBnAct(in_chs, dw_chs, act_layer=act_layer)
+            self.conv_exp = ConvNormAct(in_chs, dw_chs, act_layer=act_layer)
        else:
            dw_chs = in_chs
            self.conv_exp = None

-        self.conv_dw = ConvBnAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False)
+        self.conv_dw = ConvNormAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False)
        if se_ratio > 0:
            self.se = SEWithNorm(dw_chs, rd_channels=make_divisible(int(dw_chs * se_ratio), ch_div))
        else:
            self.se = None
        self.act_dw = create_act_layer(dw_act_layer)

-        self.conv_pwl = ConvBnAct(dw_chs, out_chs, 1, apply_act=False)
+        self.conv_pwl = ConvNormAct(dw_chs, out_chs, 1, apply_act=False)
        self.drop_path = drop_path

    def feat_channels(self, exp=False):
@ -138,7 +138,7 @@ def _build_blocks(
        feat_chs += [features[-1].feat_channels()]
    pen_chs = make_divisible(1280 * width_mult, divisor=ch_div)
    feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=f'features.{len(features) - 1}')]
-    features.append(ConvBnAct(prev_chs, pen_chs, act_layer=act_layer))
+    features.append(ConvNormAct(prev_chs, pen_chs, act_layer=act_layer))
    return features, feature_info


@ -153,7 +153,7 @@ class ReXNetV1(nn.Module):
        assert output_stride == 32  # FIXME support dilation
        stem_base_chs = 32 / width_mult if width_mult < 1.0 else 32
        stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div)
-        self.stem = ConvBnAct(in_chans, stem_chs, 3, stride=2, act_layer=act_layer)
+        self.stem = ConvNormAct(in_chans, stem_chs, 3, stride=2, act_layer=act_layer)

        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div)
        features, self.feature_info = _build_blocks(
--- a/timm/models/sknet.py
+++ b/timm/models/sknet.py
@ -14,7 +14,7 @@ from torch import nn as nn

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import SelectiveKernel, ConvBnAct, create_attn
+from .layers import SelectiveKernel, ConvNormAct, ConvNormActAa, create_attn
 from .registry import register_model
 from .resnet import ResNet

@ -52,7 +52,7 @@ class SelectiveKernelBasic(nn.Module):
        super(SelectiveKernelBasic, self).__init__()

        sk_kwargs = sk_kwargs or {}
-        conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
+        conv_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
        assert base_width == 64, 'BasicBlock doest not support changing base width'
        first_planes = planes // reduce_first
@ -60,16 +60,13 @@ class SelectiveKernelBasic(nn.Module):
        first_dilation = first_dilation or dilation

        self.conv1 = SelectiveKernel(
-            inplanes, first_planes, stride=stride, dilation=first_dilation, **conv_kwargs, **sk_kwargs)
-        conv_kwargs['act_layer'] = None
-        self.conv2 = ConvBnAct(
-            first_planes, outplanes, kernel_size=3, dilation=dilation, **conv_kwargs)
+            inplanes, first_planes, stride=stride, dilation=first_dilation,
+            aa_layer=aa_layer, drop_layer=drop_block, **conv_kwargs, **sk_kwargs)
+        self.conv2 = ConvNormAct(
+            first_planes, outplanes, kernel_size=3, dilation=dilation, apply_act=False, **conv_kwargs)
        self.se = create_attn(attn_layer, outplanes)
        self.act = act_layer(inplace=True)
        self.downsample = downsample
-        self.stride = stride
-        self.dilation = dilation
-        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
@ -100,24 +97,20 @@ class SelectiveKernelBottleneck(nn.Module):
        super(SelectiveKernelBottleneck, self).__init__()

        sk_kwargs = sk_kwargs or {}
-        conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
+        conv_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer)
        width = int(math.floor(planes * (base_width / 64)) * cardinality)
        first_planes = width // reduce_first
        outplanes = planes * self.expansion
        first_dilation = first_dilation or dilation

-        self.conv1 = ConvBnAct(inplanes, first_planes, kernel_size=1, **conv_kwargs)
+        self.conv1 = ConvNormAct(inplanes, first_planes, kernel_size=1, **conv_kwargs)
        self.conv2 = SelectiveKernel(
            first_planes, width, stride=stride, dilation=first_dilation, groups=cardinality,
-            **conv_kwargs, **sk_kwargs)
-        conv_kwargs['act_layer'] = None
-        self.conv3 = ConvBnAct(width, outplanes, kernel_size=1, **conv_kwargs)
+            aa_layer=aa_layer, drop_layer=drop_block, **conv_kwargs, **sk_kwargs)
+        self.conv3 = ConvNormAct(width, outplanes, kernel_size=1, apply_act=False, **conv_kwargs)
        self.se = create_attn(attn_layer, outplanes)
        self.act = act_layer(inplace=True)
        self.downsample = downsample
-        self.stride = stride
-        self.dilation = dilation
-        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
--- a/timm/models/vovnet.py
+++ b/timm/models/vovnet.py
@ -20,8 +20,8 @@ import torch.nn.functional as F
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .registry import register_model
 from .helpers import build_model_with_cfg
-from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, DropPath,\
-    create_attn, create_norm_act, get_norm_act_layer
+from .layers import ConvNormAct, SeparableConvNormAct, BatchNormAct2d, ClassifierHead, DropPath,\
+    create_attn, create_norm_act_layer, get_norm_act_layer


 # model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 &
@ -189,23 +189,23 @@ class OsaBlock(nn.Module):
        next_in_chs = in_chs
        if self.depthwise and next_in_chs != mid_chs:
            assert not residual
-            self.conv_reduction = ConvBnAct(next_in_chs, mid_chs, 1, **conv_kwargs)
+            self.conv_reduction = ConvNormAct(next_in_chs, mid_chs, 1, **conv_kwargs)
        else:
            self.conv_reduction = None

        mid_convs = []
        for i in range(layer_per_block):
            if self.depthwise:
-                conv = SeparableConvBnAct(mid_chs, mid_chs, **conv_kwargs)
+                conv = SeparableConvNormAct(mid_chs, mid_chs, **conv_kwargs)
            else:
-                conv = ConvBnAct(next_in_chs, mid_chs, 3, **conv_kwargs)
+                conv = ConvNormAct(next_in_chs, mid_chs, 3, **conv_kwargs)
            next_in_chs = mid_chs
            mid_convs.append(conv)
        self.conv_mid = SequentialAppendList(*mid_convs)

        # feature aggregation
        next_in_chs = in_chs + layer_per_block * mid_chs
-        self.conv_concat = ConvBnAct(next_in_chs, out_chs, **conv_kwargs)
+        self.conv_concat = ConvNormAct(next_in_chs, out_chs, **conv_kwargs)

        if attn:
            self.attn = create_attn(attn, out_chs)
@ -283,9 +283,9 @@ class VovNet(nn.Module):

        # Stem module
        last_stem_stride = stem_stride // 2
-        conv_type = SeparableConvBnAct if cfg["depthwise"] else ConvBnAct
+        conv_type = SeparableConvNormAct if cfg["depthwise"] else ConvNormAct
        self.stem = nn.Sequential(*[
-            ConvBnAct(in_chans, stem_chs[0], 3, stride=2, **conv_kwargs),
+            ConvNormAct(in_chans, stem_chs[0], 3, stride=2, **conv_kwargs),
            conv_type(stem_chs[0], stem_chs[1], 3, stride=1, **conv_kwargs),
            conv_type(stem_chs[1], stem_chs[2], 3, stride=last_stem_stride, **conv_kwargs),
        ])
@ -395,12 +395,12 @@ def eca_vovnet39b(pretrained=False, **kwargs):
@register_model
 def ese_vovnet39b_evos(pretrained=False, **kwargs):
    def norm_act_fn(num_features, **nkwargs):
-        return create_norm_act('evonorms0', num_features, jit=False, **nkwargs)
+        return create_norm_act_layer('evonorms0', num_features, jit=False, **nkwargs)
    return _create_vovnet('ese_vovnet39b_evos', pretrained=pretrained, norm_layer=norm_act_fn, **kwargs)


@register_model
 def ese_vovnet99b_iabn(pretrained=False, **kwargs):
-    norm_layer = get_norm_act_layer('iabn')
+    norm_layer = get_norm_act_layer('iabn', act_layer='leaky_relu')
    return _create_vovnet(
        'ese_vovnet99b_iabn', pretrained=pretrained, norm_layer=norm_layer, act_layer=nn.LeakyReLU, **kwargs)
--- a/timm/models/xception_aligned.py
+++ b/timm/models/xception_aligned.py
@ -12,7 +12,7 @@ import torch.nn.functional as F

 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, ConvBnAct, create_conv2d
+from .layers import ClassifierHead, ConvNormAct, create_conv2d, get_norm_act_layer
 from .layers.helpers import to_3tuple
 from .registry import register_model

@ -37,12 +37,14 @@ default_cfgs = dict(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_65-c9ae96e8.pth'),
    xception71=_cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_71-8eec7df1.pth'),
+
+    xception41p=_cfg(url=''),
 )


 class SeparableConv2d(nn.Module):
    def __init__(
-            self, inplanes, planes, kernel_size=3, stride=1, dilation=1, padding='',
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='',
            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        super(SeparableConv2d, self).__init__()
        self.kernel_size = kernel_size
@ -50,31 +52,48 @@ class SeparableConv2d(nn.Module):

        # depthwise convolution
        self.conv_dw = create_conv2d(
-            inplanes, inplanes, kernel_size, stride=stride,
+            in_chs, in_chs, kernel_size, stride=stride,
            padding=padding, dilation=dilation, depthwise=True)
-        self.bn_dw = norm_layer(inplanes)
-        if act_layer is not None:
-            self.act_dw = act_layer(inplace=True)
-        else:
-            self.act_dw = None
+        self.bn_dw = norm_layer(in_chs)
+        self.act_dw = act_layer(inplace=True) if act_layer is not None else nn.Identity()

        # pointwise convolution
-        self.conv_pw = create_conv2d(inplanes, planes, kernel_size=1)
-        self.bn_pw = norm_layer(planes)
-        if act_layer is not None:
-            self.act_pw = act_layer(inplace=True)
-        else:
-            self.act_pw = None
+        self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1)
+        self.bn_pw = norm_layer(out_chs)
+        self.act_pw = act_layer(inplace=True) if act_layer is not None else nn.Identity()

    def forward(self, x):
        x = self.conv_dw(x)
        x = self.bn_dw(x)
-        if self.act_dw is not None:
-            x = self.act_dw(x)
+        x = self.act_dw(x)
        x = self.conv_pw(x)
        x = self.bn_pw(x)
-        if self.act_pw is not None:
-            x = self.act_pw(x)
+        x = self.act_pw(x)
+        return x
+
+
+class PreSeparableConv2d(nn.Module):  # pre-act separable conv: optional leading norm -> depthwise -> 1x1 pointwise
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='',
+            act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, first_act=True):
+        super(PreSeparableConv2d, self).__init__()
+        norm_act_layer = get_norm_act_layer(norm_layer, act_layer=act_layer)
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+        # first_act=False swaps the leading norm_act_layer(in_chs) for a no-op nn.Identity()
+        self.norm = norm_act_layer(in_chs, inplace=True) if first_act else nn.Identity()
+        # depthwise convolution (in_chs -> in_chs); this is the conv that receives stride, padding and dilation
+        self.conv_dw = create_conv2d(
+            in_chs, in_chs, kernel_size, stride=stride,
+            padding=padding, dilation=dilation, depthwise=True)
+
+        # pointwise convolution (kernel_size=1, in_chs -> out_chs)
+        self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1)
+
+    def forward(self, x):
+        x = self.norm(x)  # applied to the input, ahead of both convs
+        x = self.conv_dw(x)
+        x = self.conv_pw(x)  # unlike SeparableConv2d, nothing is applied after either conv in this module
        return x


@ -88,8 +107,8 @@ class XceptionModule(nn.Module):
        self.out_channels = out_chs[-1]
        self.no_skip = no_skip
        if not no_skip and (self.out_channels != self.in_channels or stride != 1):
-            self.shortcut = ConvBnAct(
-                in_chs, self.out_channels, 1, stride=stride, norm_layer=norm_layer, act_layer=None)
+            self.shortcut = ConvNormAct(
+                in_chs, self.out_channels, 1, stride=stride, norm_layer=norm_layer, apply_act=False)
        else:
            self.shortcut = None

@ -97,7 +116,7 @@ class XceptionModule(nn.Module):
        self.stack = nn.Sequential()
        for i in range(3):
            if start_with_relu:
-                self.stack.add_module(f'act{i + 1}', nn.ReLU(inplace=i > 0))
+                self.stack.add_module(f'act{i + 1}', act_layer(inplace=i > 0))
            self.stack.add_module(f'conv{i + 1}', SeparableConv2d(
                in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type,
                act_layer=separable_act_layer, norm_layer=norm_layer))
@ -113,11 +132,42 @@ class XceptionModule(nn.Module):
        return x


+class PreXceptionModule(nn.Module):  # pre-act block: self.norm -> 3x PreSeparableConv2d (+ shortcut unless no_skip)
+    def __init__(
+            self, in_chs, out_chs, stride=1, dilation=1, pad_type='',
+            no_skip=False, act_layer=nn.ReLU, norm_layer=None):
+        super(PreXceptionModule, self).__init__()
+        out_chs = to_3tuple(out_chs)  # indexed as out_chs[0..2] below, one entry per conv in the stack
+        self.in_channels = in_chs
+        self.out_channels = out_chs[-1]
+        self.no_skip = no_skip
+        if not no_skip and (self.out_channels != self.in_channels or stride != 1):
+            self.shortcut = create_conv2d(in_chs, self.out_channels, 1, stride=stride)  # 1x1 skip projection
+        else:
+            self.shortcut = nn.Identity()
+        # NOTE(review): the default norm_layer=None is handed straight to get_norm_act_layer -- confirm None is accepted
+        self.norm = get_norm_act_layer(norm_layer, act_layer=act_layer)(in_chs, inplace=True)
+        self.stack = nn.Sequential()
+        for i in range(3):  # only conv3 (i == 2) gets the stride; conv1 is built with first_act=False
+            self.stack.add_module(f'conv{i + 1}', PreSeparableConv2d(
+                in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type,
+                act_layer=act_layer, norm_layer=norm_layer, first_act=i > 0))
+            in_chs = out_chs[i]
+
+    def forward(self, x):
+        x = self.norm(x)
+        skip = x  # the skip branches off after self.norm, not from the raw block input
+        x = self.stack(x)
+        if not self.no_skip:
+            x = x + self.shortcut(skip)
+        return x
+
+
 class XceptionAligned(nn.Module):
    """Modified Aligned Xception
    """

-    def __init__(self, block_cfg, num_classes=1000, in_chans=3, output_stride=32,
+    def __init__(self, block_cfg, num_classes=1000, in_chans=3, output_stride=32, preact=False,
                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0., global_pool='avg'):
        super(XceptionAligned, self).__init__()
        self.num_classes = num_classes
@ -126,31 +176,33 @@ class XceptionAligned(nn.Module):

        layer_args = dict(act_layer=act_layer, norm_layer=norm_layer)
        self.stem = nn.Sequential(*[
-            ConvBnAct(in_chans, 32, kernel_size=3, stride=2, **layer_args),
-            ConvBnAct(32, 64, kernel_size=3, stride=1, **layer_args)
+            ConvNormAct(in_chans, 32, kernel_size=3, stride=2, **layer_args),
+            create_conv2d(32, 64, kernel_size=3, stride=1) if preact else
+            ConvNormAct(32, 64, kernel_size=3, stride=1, **layer_args)
        ])

        curr_dilation = 1
        curr_stride = 2
        self.feature_info = []
        self.blocks = nn.Sequential()
+        module_fn = PreXceptionModule if preact else XceptionModule
        for i, b in enumerate(block_cfg):
            b['dilation'] = curr_dilation
            if b['stride'] > 1:
-                self.feature_info += [dict(
-                    num_chs=to_3tuple(b['out_chs'])[-2], reduction=curr_stride, module=f'blocks.{i}.stack.act3')]
+                name = f'blocks.{i}.stack.conv2' if preact else f'blocks.{i}.stack.act3'
+                self.feature_info += [dict(num_chs=to_3tuple(b['out_chs'])[-2], reduction=curr_stride, module=name)]
                next_stride = curr_stride * b['stride']
                if next_stride > output_stride:
                    curr_dilation *= b['stride']
                    b['stride'] = 1
                else:
                    curr_stride = next_stride
-            self.blocks.add_module(str(i), XceptionModule(**b, **layer_args))
+            self.blocks.add_module(str(i), module_fn(**b, **layer_args))
            self.num_features = self.blocks[-1].out_channels

        self.feature_info += [dict(
            num_chs=self.num_features, reduction=curr_stride, module='blocks.' + str(len(self.blocks) - 1))]
-
+        self.act = act_layer(inplace=True) if preact else nn.Identity()
        self.head = ClassifierHead(
            in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)

@ -163,6 +215,7 @@ class XceptionAligned(nn.Module):
    def forward_features(self, x):
        x = self.stem(x)
        x = self.blocks(x)
+        x = self.act(x)
        return x

    def forward(self, x):
@ -236,3 +289,22 @@ def xception71(pretrained=False, **kwargs):
    ]
    model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs)
    return _xception('xception71', pretrained=pretrained, **model_args)
+
+
+@register_model
+def xception41p(pretrained=False, **kwargs):
+    """ Modified Aligned Xception-41 w/ Pre-Act (block_cfg below, preact=True, nn.BatchNorm2d norm).
+    `pretrained` is passed to `_xception`; extra `**kwargs` are merged into the model args it receives."""
+    block_cfg = [
+        # entry flow
+        dict(in_chs=64, out_chs=128, stride=2),
+        dict(in_chs=128, out_chs=256, stride=2),
+        dict(in_chs=256, out_chs=728, stride=2),
+        # middle flow: 8 distinct dicts ([d] * 8 would alias one dict, and XceptionAligned writes to each entry)
+        *[dict(in_chs=728, out_chs=728, stride=1) for _ in range(8)],
+        # exit flow
+        dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2),
+        dict(in_chs=1024, out_chs=(1536, 1536, 2048), no_skip=True, stride=1),
+    ]
+    model_args = dict(block_cfg=block_cfg, preact=True, norm_layer=nn.BatchNorm2d, **kwargs)
+    return _xception('xception41p', pretrained=pretrained, **model_args)