Monster commit, activation refactor, VoVNet, norm_act improvements, more

* refactor activations into basic PyTorch, jit scripted, and memory efficient custom autograd variants * implement hard-mish, better grad for hard-swish * add initial VoVNet V1/V2 impl, fix #151 * VoVNet and DenseNet first models to use NormAct layers (support BatchNormAct2d, EvoNorm, InplaceIABN) * Wrap IABN for any models that use it * make more models torchscript compatible (DPN, PNasNet, Res2Net, SelecSLS) and add tests
4 years ago · eb7653614f
parent ff94ffce61
commit eb7653614f
37 changed files with 1467 additions and 316 deletions
--- a/tests/test_models.py
+++ b/tests/test_models.py
@ -4,7 +4,7 @@ import platform
 import os
 import fnmatch
-from timm import list_models, create_model
+from timm import list_models, create_model, set_scriptable
 if 'GITHUB_ACTIONS' in os.environ and 'Linux' in platform.system():
@ -53,6 +53,8 @@ def test_model_backward(model_name, batch_size):
    inputs = torch.randn((batch_size, *input_size))
    outputs = model(inputs)
    outputs.mean().backward()
    for n, x in model.named_parameters():
        assert x.grad is not None, f'No gradient for {n}'
    num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None])
    assert outputs.shape[-1] == 42
@ -83,3 +85,25 @@ def test_model_default_cfgs(model_name, batch_size):
        assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2]
    assert any([k.startswith(classifier) for k in state_dict.keys()]), f'{classifier} not in model params'
    assert any([k.startswith(first_conv) for k in state_dict.keys()]), f'{first_conv} not in model params'
 EXCLUDE_JIT_FILTERS = [
    '*iabn*', 'tresnet*',  # models using inplace abn unlikely to ever be scriptable
    'dla*', 'hrnet*',  # hopefully fix at some point
 ]
@pytest.mark.timeout(120)
@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS))
@pytest.mark.parametrize('batch_size', [1])
def test_model_forward_torchscript(model_name, batch_size):
    """Script each model with torch.jit and run one forward pass through it."""
    # Small resolution on purpose: jit compile is already slow and the normal
    # resolution is exercised by the non-scripted tests.
    input_size = (3, 128, 128)

    # Models must be built in scriptable mode before they are scripted.
    with set_scriptable(True):
        net = create_model(model_name, pretrained=False)
    net.eval()

    scripted = torch.jit.script(net)
    batch = torch.randn((batch_size, *input_size))
    outputs = scripted(batch)

    assert outputs.shape[0] == batch_size
    assert not torch.isnan(outputs).any(), 'Output included NaNs'
--- a/timm/init.py
+++ b/timm/init.py
@ -1,2 +1,3 @@
 from .version import __version__
-from .models import create_model, list_models, is_model, list_modules, model_entrypoint
+from .models import create_model, list_models, is_model, list_modules, model_entrypoint, \
    is_scriptable, is_exportable, set_scriptable, set_exportable
--- a/timm/models/init.py
+++ b/timm/models/init.py
@ -20,9 +20,11 @@ from .sknet import *
 from .tresnet import *
 from .resnest import *
 from .regnet import *
 from .vovnet import *
 from .registry import *
 from .factory import create_model
 from .helpers import load_checkpoint, resume_checkpoint
 from .layers import TestTimePoolHead, apply_test_time_pool
 from .layers import convert_splitbn_model
 from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit
--- a/timm/models/densenet.py
+++ b/timm/models/densenet.py
@ -41,13 +41,13 @@ default_cfgs = {
 class DenseLayer(nn.Module):
-    def __init__(self, num_input_features, growth_rate, bn_size, norm_act_layer=BatchNormAct2d,
+    def __init__(self, num_input_features, growth_rate, bn_size, norm_layer=BatchNormAct2d,
                 drop_rate=0., memory_efficient=False):
        super(DenseLayer, self).__init__()
-        self.add_module('norm1', norm_act_layer(num_input_features)),
+        self.add_module('norm1', norm_layer(num_input_features)),
        self.add_module('conv1', nn.Conv2d(
            num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
-        self.add_module('norm2', norm_act_layer(bn_size * growth_rate)),
+        self.add_module('norm2', norm_layer(bn_size * growth_rate)),
        self.add_module('conv2', nn.Conv2d(
            bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)),
        self.drop_rate = float(drop_rate)
@ -109,7 +109,7 @@ class DenseLayer(nn.Module):
 class DenseBlock(nn.ModuleDict):
    _version = 2
-    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, norm_act_layer=nn.ReLU,
+    def __init__(self, num_layers, num_input_features, bn_size, growth_rate, norm_layer=nn.ReLU,
                 drop_rate=0., memory_efficient=False):
        super(DenseBlock, self).__init__()
        for i in range(num_layers):
@ -117,7 +117,7 @@ class DenseBlock(nn.ModuleDict):
                num_input_features + i * growth_rate,
                growth_rate=growth_rate,
                bn_size=bn_size,
-                norm_act_layer=norm_act_layer,
+                norm_layer=norm_layer,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient,
            )
@ -132,9 +132,9 @@ class DenseBlock(nn.ModuleDict):
 class DenseTransition(nn.Sequential):
-    def __init__(self, num_input_features, num_output_features, norm_act_layer=nn.BatchNorm2d, aa_layer=None):
+    def __init__(self, num_input_features, num_output_features, norm_layer=nn.BatchNorm2d, aa_layer=None):
        super(DenseTransition, self).__init__()
-        self.add_module('norm', norm_act_layer(num_input_features))
+        self.add_module('norm', norm_layer(num_input_features))
        self.add_module('conv', nn.Conv2d(
            num_input_features, num_output_features, kernel_size=1, stride=1, bias=False))
        if aa_layer is not None:
@ -160,7 +160,7 @@ class DenseNet(nn.Module):
    def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), bn_size=4, stem_type='',
                 num_classes=1000, in_chans=3, global_pool='avg',
-                 norm_act_layer=BatchNormAct2d, aa_layer=None, drop_rate=0, memory_efficient=False):
+                 norm_layer=BatchNormAct2d, aa_layer=None, drop_rate=0, memory_efficient=False):
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        super(DenseNet, self).__init__()
@ -181,17 +181,17 @@ class DenseNet(nn.Module):
                stem_chs_2 = num_init_features if 'narrow' in stem_type else 6 * (growth_rate // 4)
            self.features = nn.Sequential(OrderedDict([
                ('conv0', nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False)),
-                ('norm0', norm_act_layer(stem_chs_1)),
+                ('norm0', norm_layer(stem_chs_1)),
                ('conv1', nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False)),
-                ('norm1', norm_act_layer(stem_chs_2)),
+                ('norm1', norm_layer(stem_chs_2)),
                ('conv2', nn.Conv2d(stem_chs_2, num_init_features, 3, stride=1, padding=1, bias=False)),
-                ('norm2', norm_act_layer(num_init_features)),
+                ('norm2', norm_layer(num_init_features)),
                ('pool0', stem_pool),
            ]))
        else:
            self.features = nn.Sequential(OrderedDict([
                ('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
-                ('norm0', norm_act_layer(num_init_features)),
+                ('norm0', norm_layer(num_init_features)),
                ('pool0', stem_pool),
            ]))
@ -203,7 +203,7 @@ class DenseNet(nn.Module):
                num_input_features=num_features,
                bn_size=bn_size,
                growth_rate=growth_rate,
-                norm_act_layer=norm_act_layer,
+                norm_layer=norm_layer,
                drop_rate=drop_rate,
                memory_efficient=memory_efficient
            )
@ -212,12 +212,12 @@ class DenseNet(nn.Module):
            if i != len(block_config) - 1:
                trans = DenseTransition(
                    num_input_features=num_features, num_output_features=num_features // 2,
-                    norm_act_layer=norm_act_layer)
+                    norm_layer=norm_layer)
                self.features.add_module('transition%d' % (i + 1), trans)
                num_features = num_features // 2
        # Final batch norm
-        self.features.add_module('norm5', norm_act_layer(num_features))
+        self.features.add_module('norm5', norm_layer(num_features))
        # Linear layer
        self.num_features = num_features
@ -346,7 +346,7 @@ def densenet121d_evob(pretrained=False, **kwargs):
        return create_norm_act('EvoNormBatch', num_features, jit=True, **kwargs)
    model = _densenet(
        'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep',
-        norm_act_layer=norm_act_fn, pretrained=pretrained, **kwargs)
+        norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
    return model
@ -359,7 +359,7 @@ def densenet121d_evos(pretrained=False, **kwargs):
        return create_norm_act('EvoNormSample', num_features, jit=True, **kwargs)
    model = _densenet(
        'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep',
-        norm_act_layer=norm_act_fn, pretrained=pretrained, **kwargs)
+        norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
    return model
@ -372,7 +372,7 @@ def densenet121d_iabn(pretrained=False, **kwargs):
        return create_norm_act('iabn', num_features, **kwargs)
    model = _densenet(
        'densenet121tn', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep',
-        norm_act_layer=norm_act_fn, pretrained=pretrained, **kwargs)
+        norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
    return model
--- a/timm/models/dpn.py
+++ b/timm/models/dpn.py
@ -10,6 +10,7 @@ from __future__ import division
 from __future__ import print_function
 from collections import OrderedDict
 from typing import Union, Optional, List, Tuple
 import torch
 import torch.nn as nn
@ -54,8 +55,19 @@ class CatBnAct(nn.Module):
        self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
        self.act = activation_fn
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
-        x = torch.cat(x, dim=1) if isinstance(x, tuple) else x
+        # type: (Tuple[torch.Tensor, torch.Tensor]) -> (torch.Tensor)
        pass
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
        # type: (torch.Tensor) -> (torch.Tensor)
        pass
    def forward(self, x):
        if isinstance(x, tuple):
            x = torch.cat(x, dim=1)
        return self.act(self.bn(x))
@ -107,6 +119,8 @@ class DualPathBlock(nn.Module):
            self.key_stride = 1
            self.has_proj = False
        self.c1x1_w_s1 = None
        self.c1x1_w_s2 = None
        if self.has_proj:
            # Using different member names here to allow easier parameter key matching for conversion
            if self.key_stride == 2:
@ -115,6 +129,7 @@ class DualPathBlock(nn.Module):
            else:
                self.c1x1_w_s1 = BnActConv2d(
                    in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=1)
        self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1)
        self.c3x3_b = BnActConv2d(
            in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3,
@ -125,27 +140,46 @@ class DualPathBlock(nn.Module):
            self.c1x1_c2 = nn.Conv2d(num_3x3_b, inc, kernel_size=1, bias=False)
        else:
            self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1)
            self.c1x1_c1 = None
            self.c1x1_c2 = None
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
-        x_in = torch.cat(x, dim=1) if isinstance(x, tuple) else x
+        # type: (Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]
-        if self.has_proj:
+        pass
-            if self.key_stride == 2:
+
-                x_s = self.c1x1_w_s2(x_in)
+    @torch.jit._overload_method  # noqa: F811
-            else:
+    def forward(self, x):
-                x_s = self.c1x1_w_s1(x_in)
+        # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
-            x_s1 = x_s[:, :self.num_1x1_c, :, :]
+        pass
-            x_s2 = x_s[:, self.num_1x1_c:, :, :]
+
    def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor]:
        if isinstance(x, tuple):
            x_in = torch.cat(x, dim=1)
        else:
            x_in = x
        if self.c1x1_w_s1 is None and self.c1x1_w_s2 is None:
            # self.has_proj == False, torchscript requires condition on module == None
            x_s1 = x[0]
            x_s2 = x[1]
        else:
            # self.has_proj == True
            if self.c1x1_w_s1 is not None:
                # self.key_stride = 1
                x_s = self.c1x1_w_s1(x_in)
            else:
                # self.key_stride = 2
                x_s = self.c1x1_w_s2(x_in)
            x_s1 = x_s[:, :self.num_1x1_c, :, :]
            x_s2 = x_s[:, self.num_1x1_c:, :, :]
        x_in = self.c1x1_a(x_in)
        x_in = self.c3x3_b(x_in)
-        if self.b:
+        x_in = self.c1x1_c(x_in)
-            x_in = self.c1x1_c(x_in)
+        if self.c1x1_c1 is not None:
            # self.b == True, using None check for torchscript compat
            out1 = self.c1x1_c1(x_in)
            out2 = self.c1x1_c2(x_in)
        else:
            x_in = self.c1x1_c(x_in)
            out1 = x_in[:, :self.num_1x1_c, :, :]
            out2 = x_in[:, self.num_1x1_c:, :, :]
        resid = x_s1 + out1
@ -167,11 +201,9 @@ class DPN(nn.Module):
        # conv1
        if small:
-            blocks['conv1_1'] = InputBlock(
+            blocks['conv1_1'] = InputBlock(num_init_features, in_chans=in_chans, kernel_size=3, padding=1)
                num_init_features, in_chans=in_chans, kernel_size=3, padding=1)
        else:
-            blocks['conv1_1'] = InputBlock(
+            blocks['conv1_1'] = InputBlock(num_init_features, in_chans=in_chans, kernel_size=7, padding=3)
                num_init_features, in_chans=in_chans, kernel_size=7, padding=3)
        # conv2
        bw = 64 * bw_factor
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@ -24,11 +24,15 @@ An implementation of EfficienNet that covers variety of related models with effi
 Hacked together by Ross Wightman
 """
 import torch.nn as nn
 import torch.nn.functional as F
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
-from .efficientnet_builder import *
+from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
 from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights
 from .feature_hooks import FeatureHooks
 from .helpers import load_pretrained, adapt_model_from_file
-from .layers import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d, create_conv2d
 from .registry import register_model
 __all__ = ['EfficientNet']
@ -631,7 +635,7 @@ def _gen_mobilenet_v2(
        fix_stem=fix_stem_head,
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
-        act_layer=nn.ReLU6,
+        act_layer=resolve_act_layer(kwargs, 'relu6'),
        **kwargs
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -741,7 +745,7 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
        num_features=round_channels(1280, channel_multiplier, 8, None),
        stem_size=32,
        channel_multiplier=channel_multiplier,
-        act_layer=Swish,
+        act_layer=resolve_act_layer(kwargs, 'swish'),
        norm_kwargs=resolve_bn_args(kwargs),
        variant=variant,
        **kwargs,
@ -772,7 +776,7 @@ def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0
        stem_size=32,
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
-        act_layer=nn.ReLU,
+        act_layer=resolve_act_layer(kwargs, 'relu'),
        **kwargs,
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -802,7 +806,7 @@ def _gen_efficientnet_condconv(
        stem_size=32,
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
-        act_layer=Swish,
+        act_layer=resolve_act_layer(kwargs, 'swish'),
        **kwargs,
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -842,7 +846,7 @@ def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0
        stem_size=32,
        fix_stem=True,
        channel_multiplier=channel_multiplier,
-        act_layer=nn.ReLU6,
+        act_layer=resolve_act_layer(kwargs, 'relu6'),
        norm_kwargs=resolve_bn_args(kwargs),
        **kwargs,
    )
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@ -1,9 +1,9 @@
 import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from .layers.activations import sigmoid
 from .layers import create_conv2d, drop_path
 from .layers import create_conv2d, drop_path, get_act_layer
 from .layers.activations import sigmoid
 # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
 # papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
@ -52,6 +52,13 @@ def resolve_se_args(kwargs, in_chs, act_layer=None):
    return se_kwargs
 def resolve_act_layer(kwargs, default='relu'):
     """Pop ``act_layer`` from ``kwargs`` and resolve it to an activation layer.

     Args:
         kwargs: dict of model kwargs; the ``act_layer`` entry is removed in place.
         default: value used when ``kwargs`` has no ``act_layer`` entry.

     Returns:
         The result of ``get_act_layer`` when the value is a string, otherwise
         the value itself, unchanged.
     """
     requested = kwargs.pop('act_layer', default)
     return get_act_layer(requested) if isinstance(requested, str) else requested
 def make_divisible(v, divisor=8, min_value=None):
    min_value = min_value or divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
@ -213,7 +220,7 @@ class InvertedResidual(nn.Module):
        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
        self.drop_path_rate = drop_path_rate
-
+        print(act_layer)
        # Point-wise expansion
        self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@ -1,13 +1,15 @@
 import logging
 import math
 import re
-from collections.__init__ import OrderedDict
+from collections import OrderedDict
 from copy import deepcopy
 import torch.nn as nn
-from .layers import CondConv2d, get_condconv_initializer
+
 from .layers.activations import HardSwish, Swish
 from .efficientnet_blocks import *
 from .layers import CondConv2d, get_condconv_initializer
 __all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights"]
 def _parse_ksize(ss):
@ -57,13 +59,13 @@ def _decode_block_str(block_str):
            key = op[0]
            v = op[1:]
            if v == 're':
-                value = nn.ReLU
+                value = get_act_layer('relu')
            elif v == 'r6':
-                value = nn.ReLU6
+                value = get_act_layer('relu6')
            elif v == 'hs':
-                value = HardSwish
+                value = get_act_layer('hard_swish')
            elif v == 'sw':
-                value = Swish
+                value = get_act_layer('swish')
            else:
                continue
            options[key] = value
--- a/timm/models/layers/init.py
+++ b/timm/models/layers/init.py
@ -1,25 +1,28 @@
 from .padding import get_padding
 from .pool2d_same import AvgPool2dSame
 from .conv2d_same import Conv2dSame
 from .conv_bn_act import ConvBnAct
 from .mixed_conv2d import MixedConv2d
 from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .pool2d_same import create_pool2d
 from .create_conv2d import create_conv2d
 from .create_attn import create_attn
 from .selective_kernel import SelectiveKernelConv
 from .se import SEModule
 from .eca import EcaModule, CecaModule
 from .activations import *
 from .adaptive_avgmax_pool import \
    adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
 from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
 from .test_time_pool import TestTimePoolHead, apply_test_time_pool
 from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
 from .anti_aliasing import AntiAliasDownsampleLayer
 from .space_to_depth import SpaceToDepthModule
 from .blur_pool import BlurPool2d
-from .norm_act import BatchNormAct2d
+from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .config import is_exportable, is_scriptable, set_exportable, set_scriptable, is_no_jit, set_no_jit
 from .conv2d_same import Conv2dSame
 from .conv_bn_act import ConvBnAct
 from .create_act import create_act_layer, get_act_layer, get_act_fn
 from .create_attn import create_attn
 from .create_conv2d import create_conv2d
 from .create_norm_act import create_norm_act, get_norm_act_layer
 from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
 from .eca import EcaModule, CecaModule
 from .evo_norm import EvoNormBatch2d, EvoNormSample2d
-from .create_norm_act import create_norm_act
+from .inplace_abn import InplaceAbn
 from .mixed_conv2d import MixedConv2d
 from .norm_act import BatchNormAct2d
 from .padding import get_padding
 from .pool2d_same import AvgPool2dSame, create_pool2d
 from .se import SEModule
 from .selective_kernel import SelectiveKernelConv
 from .separable_conv import SeparableConv2d, SeparableConvBnAct
 from .space_to_depth import SpaceToDepthModule
 from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
 from .test_time_pool import TestTimePoolHead, apply_test_time_pool
 from .weight_init import trunc_normal_
--- a/timm/models/layers/activations.py
+++ b/timm/models/layers/activations.py
@ -6,85 +6,15 @@ easily be swapped. All have an `inplace` arg even if not used.
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
-_USE_MEM_EFFICIENT_ISH = True
+def swish(x, inplace: bool = False):
-if _USE_MEM_EFFICIENT_ISH:
+    """Swish - Described in: https://arxiv.org/abs/1710.05941
-    # This version reduces memory overhead of Swish during training by
+    """
-    # recomputing torch.sigmoid(x) in backward instead of saving it.
+    return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
    @torch.jit.script
    def swish_jit_fwd(x):
        return x.mul(torch.sigmoid(x))
    @torch.jit.script
    def swish_jit_bwd(x, grad_output):
        x_sigmoid = torch.sigmoid(x)
        return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
    class SwishJitAutoFn(torch.autograd.Function):
        """ torch.jit.script optimised Swish
        Inspired by conversation btw Jeremy Howard & Adam Pazske
        https://twitter.com/jeremyphoward/status/1188251041835315200
        """
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return swish_jit_fwd(x)
        @staticmethod
        def backward(ctx, grad_output):
            x = ctx.saved_tensors[0]
            return swish_jit_bwd(x, grad_output)
    def swish(x, _inplace=False):
        return SwishJitAutoFn.apply(x)
    @torch.jit.script
    def mish_jit_fwd(x):
        return x.mul(torch.tanh(F.softplus(x)))
    @torch.jit.script
    def mish_jit_bwd(x, grad_output):
        x_sigmoid = torch.sigmoid(x)
        x_tanh_sp = F.softplus(x).tanh()
        return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
    class MishJitAutoFn(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return mish_jit_fwd(x)
        @staticmethod
        def backward(ctx, grad_output):
            x = ctx.saved_tensors[0]
            return mish_jit_bwd(x, grad_output)
    def mish(x, _inplace=False):
        return MishJitAutoFn.apply(x)
 else:
    def swish(x, inplace: bool = False):
        """Swish - Described in: https://arxiv.org/abs/1710.05941
        """
        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
    def mish(x, _inplace: bool = False):
        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
        """
        return x.mul(F.softplus(x).tanh())
 class Swish(nn.Module):
@ -96,13 +26,21 @@ class Swish(nn.Module):
        return swish(x, self.inplace)
 def mish(x, inplace: bool = False):
     """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681

     Computes ``x * tanh(softplus(x))``.
     NOTE: there is no working inplace variant; ``inplace`` is accepted for a
     common interface and ignored, a new tensor is always returned.
     """
     softplus_x = F.softplus(x)
     return x.mul(softplus_x.tanh())
 class Mish(nn.Module):
    """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
    """
    def __init__(self, inplace: bool = False):
        super(Mish, self).__init__()
        self.inplace = inplace
    def forward(self, x):
-        return mish(x, self.inplace)
+        return mish(x)
 def sigmoid(x, inplace: bool = False):
@ -162,3 +100,22 @@ class HardSigmoid(nn.Module):
    def forward(self, x):
        return hard_sigmoid(x, self.inplace)
 def hard_mish(x, inplace: bool = False):
     """ Hard Mish: ``0.5 * x * clamp(x + 2, 0, 2)``

     Experimental, based on notes by Mish author Diganta Misra at
       https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md

     When ``inplace`` is True the input tensor is overwritten and returned.
     """
     # The gate is computed from the untouched input before any in-place write
     gate = (x + 2).clamp(min=0, max=2)
     if not inplace:
         return 0.5 * x * gate
     return x.mul_(0.5 * gate)
 class HardMish(nn.Module):
     """Module form of the ``hard_mish`` activation function."""

     def __init__(self, inplace: bool = False):
         super().__init__()
         # Passed through to hard_mish on every forward call
         self.inplace = inplace

     def forward(self, x):
         activated = hard_mish(x, self.inplace)
         return activated
--- a/timm/models/layers/activations_jit.py
+++ b/timm/models/layers/activations_jit.py
@ -0,0 +1,90 @@
 """ Activations
 A collection of jit-scripted activations fn and modules with a common interface so that they can
 easily be swapped. All have an `inplace` arg even if not used.
 All jit scripted activations are lacking in-place variations on purpose, scripted kernel fusion does not
 currently work across in-place op boundaries, thus performance is equal to or less than the non-scripted
 versions if they contain in-place ops.
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@torch.jit.script
def swish_jit(x, inplace: bool = False):
    """Swish (``x * sigmoid(x)``) - Described in: https://arxiv.org/abs/1710.05941

    ``inplace`` is accepted for a common interface and ignored.
    """
    gate = torch.sigmoid(x)
    return x * gate
@torch.jit.script
def mish_jit(x, _inplace: bool = False):
    """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681

    Computes ``x * tanh(softplus(x))``; ``_inplace`` is accepted and ignored.
    """
    softplus_x = F.softplus(x)
    return x * torch.tanh(softplus_x)
 class SwishJit(nn.Module):
     """Module form of the jit-scripted ``swish_jit`` activation."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = swish_jit(x)
         return activated
 class MishJit(nn.Module):
     """Module form of the jit-scripted ``mish_jit`` activation."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = mish_jit(x)
         return activated
@torch.jit.script
def hard_sigmoid_jit(x, inplace: bool = False):
    """Hard sigmoid: ``clamp(x + 3, 0, 6) / 6``; ``inplace`` is accepted and ignored."""
    # Equivalent to F.relu6(x + 3.) / 6.; clamp seems ever so slightly faster
    shifted = x + 3
    return shifted.clamp(min=0, max=6) / 6.
 class HardSigmoidJit(nn.Module):
     """Module form of the jit-scripted ``hard_sigmoid_jit`` activation."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = hard_sigmoid_jit(x)
         return activated
@torch.jit.script
def hard_swish_jit(x, inplace: bool = False):
    """Hard swish: ``x * clamp(x + 3, 0, 6) / 6``; ``inplace`` is accepted and ignored."""
    # Equivalent to x * (F.relu6(x + 3.) / 6); clamp seems ever so slightly faster
    gate = (x + 3).clamp(min=0, max=6) / 6.
    return x * gate
 class HardSwishJit(nn.Module):
     """Module form of the jit-scripted ``hard_swish_jit`` activation."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = hard_swish_jit(x)
         return activated
@torch.jit.script
def hard_mish_jit(x, inplace: bool = False):
    """ Hard Mish: ``0.5 * x * clamp(x + 2, 0, 2)``

    Experimental, based on notes by Mish author Diganta Misra at
      https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md

    ``inplace`` is accepted for a common interface and ignored.
    """
    gate = (x + 2).clamp(min=0, max=2)
    return 0.5 * x * gate
 class HardMishJit(nn.Module):
     """Module form of the jit-scripted ``hard_mish_jit`` activation."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = hard_mish_jit(x)
         return activated
--- a/timm/models/layers/activations_me.py
+++ b/timm/models/layers/activations_me.py
@ -0,0 +1,208 @@
 """ Activations (memory-efficient w/ custom autograd)
 A collection of activations fn and modules with a common interface so that they can
 easily be swapped. All have an `inplace` arg even if not used.
 These activations are not compatible with jit scripting or ONNX export of the model, please use either
 the JIT or basic versions of the activations.
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@torch.jit.script
def swish_jit_fwd(x):
    """Forward kernel for swish: ``x * sigmoid(x)``."""
    gate = torch.sigmoid(x)
    return x * gate
@torch.jit.script
def swish_jit_bwd(x, grad_output):
    """Backward kernel for swish: the local derivative at ``x`` times ``grad_output``."""
    sig = torch.sigmoid(x)
    # d/dx [x * s(x)] = s(x) * (1 + x * (1 - s(x)))
    local_grad = sig * (1 + x * (1 - sig))
    return grad_output * local_grad
 class SwishJitAutoFn(torch.autograd.Function):
     """ torch.jit.script optimised Swish w/ memory-efficient checkpoint

     Only the input is saved for backward; the gradient is computed from it
     by ``swish_jit_bwd``.

     Inspired by conversation btw Jeremy Howard & Adam Paszke
     https://twitter.com/jeremyphoward/status/1188251041835315200
     """

     @staticmethod
     def forward(ctx, x):
         ctx.save_for_backward(x)
         out = swish_jit_fwd(x)
         return out

     @staticmethod
     def backward(ctx, grad_output):
         saved_input = ctx.saved_tensors[0]
         grad_input = swish_jit_bwd(saved_input, grad_output)
         return grad_input
 def swish_me(x, inplace=False):
     """Memory-efficient swish via ``SwishJitAutoFn``; ``inplace`` is accepted and ignored."""
     activated = SwishJitAutoFn.apply(x)
     return activated
 class SwishMe(nn.Module):
     """Module form of the memory-efficient swish (``SwishJitAutoFn``)."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = SwishJitAutoFn.apply(x)
         return activated
@torch.jit.script
def mish_jit_fwd(x):
    """Forward kernel for mish: ``x * tanh(softplus(x))``."""
    softplus_x = F.softplus(x)
    return x * torch.tanh(softplus_x)
@torch.jit.script
def mish_jit_bwd(x, grad_output):
    """Backward kernel for mish: the local derivative at ``x`` times ``grad_output``."""
    sig = torch.sigmoid(x)
    tanh_sp = torch.tanh(F.softplus(x))
    # d/dx [x * tanh(sp(x))] = tanh(sp) + x * sigmoid(x) * (1 - tanh(sp)^2)
    local_grad = tanh_sp + x * sig * (1 - tanh_sp * tanh_sp)
    return grad_output.mul(local_grad)
 class MishJitAutoFn(torch.autograd.Function):
     """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681

     A memory efficient, jit scripted variant of Mish. Only the input is saved
     for backward; the gradient is computed from it by ``mish_jit_bwd``.
     """

     @staticmethod
     def forward(ctx, x):
         ctx.save_for_backward(x)
         out = mish_jit_fwd(x)
         return out

     @staticmethod
     def backward(ctx, grad_output):
         saved_input = ctx.saved_tensors[0]
         grad_input = mish_jit_bwd(saved_input, grad_output)
         return grad_input
 def mish_me(x, inplace=False):
     """Memory-efficient mish via ``MishJitAutoFn``; ``inplace`` is accepted and ignored."""
     activated = MishJitAutoFn.apply(x)
     return activated
 class MishMe(nn.Module):
     """Module form of the memory-efficient mish (``MishJitAutoFn``)."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = MishJitAutoFn.apply(x)
         return activated
@torch.jit.script
def hard_sigmoid_jit_fwd(x, inplace: bool = False):
    """Forward kernel for hard sigmoid: ``clamp(x + 3, 0, 6) / 6``; ``inplace`` is ignored."""
    shifted = x + 3
    return shifted.clamp(min=0, max=6) / 6.
@torch.jit.script
def hard_sigmoid_jit_bwd(x, grad_output):
    """Backward kernel for hard sigmoid: slope 1/6 on [-3, 3], zero elsewhere."""
    in_linear_region = (x >= -3.) & (x <= 3.)
    local_grad = torch.ones_like(x) * in_linear_region / 6.
    return grad_output * local_grad
 class HardSigmoidJitAutoFn(torch.autograd.Function):
     """Hard sigmoid as a custom autograd function built on the jit-scripted kernels.

     Only the input is saved for backward; the gradient is computed from it
     by ``hard_sigmoid_jit_bwd``.
     """

     @staticmethod
     def forward(ctx, x):
         ctx.save_for_backward(x)
         out = hard_sigmoid_jit_fwd(x)
         return out

     @staticmethod
     def backward(ctx, grad_output):
         saved_input = ctx.saved_tensors[0]
         grad_input = hard_sigmoid_jit_bwd(saved_input, grad_output)
         return grad_input
 def hard_sigmoid_me(x, inplace: bool = False):
     """Memory-efficient hard sigmoid via ``HardSigmoidJitAutoFn``; ``inplace`` is ignored."""
     activated = HardSigmoidJitAutoFn.apply(x)
     return activated
 class HardSigmoidMe(nn.Module):
     """Module form of the memory-efficient hard sigmoid (``HardSigmoidJitAutoFn``)."""

     def __init__(self, inplace: bool = False):
         # `inplace` is accepted for a common constructor signature; it is not stored
         super().__init__()

     def forward(self, x):
         activated = HardSigmoidJitAutoFn.apply(x)
         return activated
@torch.jit.script
def hard_swish_jit_fwd(x):
    """Hard swish forward: ``x * clamp(x + 3, 0, 6) / 6``."""
    gate = torch.clamp(x + 3, min=0, max=6).div(6.)
    return x * gate
@torch.jit.script
def hard_swish_jit_bwd(x, grad_output):
    """Hard swish backward: slope 0 below -3, ``x / 3 + 0.5`` on [-3, 3], 1 above 3."""
    # slope outside the middle region: 1 where x >= 3, else 0
    outer_slope = torch.ones_like(x) * (x >= 3.)
    in_middle = (x >= -3.) & (x <= 3.)
    slope = torch.where(in_middle, x / 3. + .5, outer_slope)
    return grad_output * slope
 class HardSwishJitAutoFn(torch.autograd.Function):
     """A memory efficient, jit-scripted HardSwish activation

     Only the input is saved for backward; the gradient is recomputed from it.
     """

     @staticmethod
     def forward(ctx, x):
         ctx.save_for_backward(x)
         out = hard_swish_jit_fwd(x)
         return out

     @staticmethod
     def backward(ctx, grad_output):
         (saved_x,) = ctx.saved_tensors
         return hard_swish_jit_bwd(saved_x, grad_output)
 def hard_swish_me(x, inplace=False):
     """Functional hard swish via ``HardSwishJitAutoFn``; ``inplace`` is accepted and ignored."""
     result = HardSwishJitAutoFn.apply(x)
     return result
 class HardSwishMe(nn.Module):
     """Layer form of ``HardSwishJitAutoFn``.

     ``inplace`` is accepted so the constructor matches other activation layers; it is ignored.
     """

     def __init__(self, inplace: bool = False):
         super().__init__()

     def forward(self, x):
         activated = HardSwishJitAutoFn.apply(x)
         return activated
@torch.jit.script
def hard_mish_jit_fwd(x):
    """Hard Mish forward: ``0.5 * x * clamp(x + 2, 0, 2)``."""
    return 0.5 * x * (x + 2).clamp(min=0, max=2)


@torch.jit.script
def hard_mish_jit_bwd(x, grad_output):
    """Hard Mish backward: slope 0 below -2, ``x + 1`` on [-2, 0], 1 above 0."""
    # start with 1 everywhere the forward is non-zero (x >= -2) ...
    m = torch.ones_like(x) * (x >= -2.)
    # ... then use the quadratic region's derivative on [-2, 0]
    m = torch.where((x >= -2.) & (x <= 0.), x + 1., m)
    return grad_output * m


class HardMishJitAutoFn(torch.autograd.Function):
    """ A memory efficient, jit scripted variant of Hard Mish
    Experimental, based on notes by Mish author Diganta Misra at
      https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md

    Only the input is saved for backward; the gradient is recomputed from it.
    """

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        # must use the hard-mish kernel, not the regular Mish one
        return hard_mish_jit_fwd(x)

    @staticmethod
    def backward(ctx, grad_output):
        x = ctx.saved_tensors[0]
        return hard_mish_jit_bwd(x, grad_output)
 def hard_mish_me(x, inplace: bool = False):
     """Functional form of ``HardMishJitAutoFn``; ``inplace`` is accepted and ignored."""
     result = HardMishJitAutoFn.apply(x)
     return result
 class HardMishMe(nn.Module):
     """Layer form of ``HardMishJitAutoFn``.

     ``inplace`` is accepted so the constructor matches other activation layers; it is ignored.
     """

     def __init__(self, inplace: bool = False):
         super().__init__()

     def forward(self, x):
         activated = HardMishJitAutoFn.apply(x)
         return activated
--- a/timm/models/layers/cond_conv2d.py
+++ b/timm/models/layers/cond_conv2d.py
@ -15,7 +15,7 @@ from torch.nn import functional as F
 from .helpers import tup_pair
 from .conv2d_same import conv2d_same
-from timm.models.layers.padding import get_padding_value
+from .padding import get_padding_value
 def get_condconv_initializer(initializer, num_experts, expert_shape):
--- a/timm/models/layers/config.py
+++ b/timm/models/layers/config.py
@ -0,0 +1,74 @@
 """ Model / Layer Config Singleton
 """
 from typing import Any
 # Public API: an `is_*` getter and a `set_*` setter/context manager per exported flag.
 __all__ = ['is_exportable', 'is_scriptable', 'set_exportable', 'set_scriptable', 'is_no_jit', 'set_no_jit']
 # Set to True to prefer layers with no jit optimization (includes activations)
 _NO_JIT = False
 # Set to True to prefer activation layers with no jit optimization
 # NOTE(review): no getter/setter for this flag is visible in this module; confirm where it is consumed
 _NO_ACTIVATION_JIT = False
 # Set to True if exporting a model with Same padding via ONNX
 _EXPORTABLE = False
 # Set to True if wanting to use torch.jit.script on a model
 _SCRIPTABLE = False
 def is_no_jit():
     """Return the current value of the module-level ``_NO_JIT`` flag."""
     current = _NO_JIT
     return current
 class set_no_jit:
     """Set the module-level ``_NO_JIT`` flag; also usable as a context manager.

     The flag changes as soon as the object is constructed. Leaving a ``with``
     block restores the value that was active at construction time.
     """

     def __init__(self, mode: bool) -> None:
         global _NO_JIT
         previous, _NO_JIT = _NO_JIT, mode
         self.prev = previous

     def __enter__(self) -> None:
         return None

     def __exit__(self, *exc_info: Any) -> bool:
         global _NO_JIT
         _NO_JIT = self.prev
         # False: never suppress an exception raised inside the with-block
         return False
 def is_exportable():
     """Return the current value of the module-level ``_EXPORTABLE`` flag."""
     current = _EXPORTABLE
     return current
 class set_exportable:
     """Set the module-level ``_EXPORTABLE`` flag; also usable as a context manager.

     The flag changes as soon as the object is constructed. Leaving a ``with``
     block restores the value that was active at construction time.
     """

     def __init__(self, mode: bool) -> None:
         global _EXPORTABLE
         previous, _EXPORTABLE = _EXPORTABLE, mode
         self.prev = previous

     def __enter__(self) -> None:
         return None

     def __exit__(self, *exc_info: Any) -> bool:
         global _EXPORTABLE
         _EXPORTABLE = self.prev
         # False: never suppress an exception raised inside the with-block
         return False
 def is_scriptable():
     """Return the current value of the module-level ``_SCRIPTABLE`` flag."""
     current = _SCRIPTABLE
     return current
 class set_scriptable:
     """Set the module-level ``_SCRIPTABLE`` flag; also usable as a context manager.

     The flag changes as soon as the object is constructed. Leaving a ``with``
     block restores the value that was active at construction time.
     """

     def __init__(self, mode: bool) -> None:
         global _SCRIPTABLE
         previous, _SCRIPTABLE = _SCRIPTABLE, mode
         self.prev = previous

     def __enter__(self) -> None:
         return None

     def __exit__(self, *exc_info: Any) -> bool:
         global _SCRIPTABLE
         _SCRIPTABLE = self.prev
         # False: never suppress an exception raised inside the with-block
         return False
--- a/timm/models/layers/conv2d_same.py
+++ b/timm/models/layers/conv2d_same.py
@ -7,8 +7,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from typing import Tuple, Optional
-from timm.models.layers.padding import get_padding_value
+from .padding import pad_same, get_padding_value
 from .padding import pad_same
 def conv2d_same(
--- a/timm/models/layers/conv_bn_act.py
+++ b/timm/models/layers/conv_bn_act.py
@ -4,33 +4,28 @@ Hacked together by Ross Wightman
 """
 from torch import nn as nn
-from timm.models.layers import get_padding
+from .create_conv2d import create_conv2d
 from .create_norm_act import convert_norm_act_type
 class ConvBnAct(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1,
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
-                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, act_layer=nn.ReLU, apply_act=True,
                 drop_block=None, aa_layer=None):
        super(ConvBnAct, self).__init__()
        padding = get_padding(kernel_size, stride, dilation)  # assuming PyTorch style padding for this block
        use_aa = aa_layer is not None
-        self.conv = nn.Conv2d(
+        self.conv = create_conv2d(
-            in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1 if use_aa else stride,
+            in_channels, out_channels, kernel_size, stride=1 if use_aa else stride,
            padding=padding, dilation=dilation, groups=groups, bias=False)
-        self.bn = norm_layer(out_channels)
+
        # NOTE for backwards compatibility with models that use separate norm and act layer definitions
        norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs)
        self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args)
        self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None
        self.drop_block = drop_block
        if act_layer is not None:
            self.act = act_layer(inplace=True)
        else:
            self.act = None
    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.drop_block is not None:
            x = self.drop_block(x)
        if self.act is not None:
            x = self.act(x)
        if self.aa is not None:
            x = self.aa(x)
        return x
--- a/timm/models/layers/create_act.py
+++ b/timm/models/layers/create_act.py
@ -0,0 +1,103 @@
 from .activations import *
 from .activations_jit import *
 from .activations_me import *
 from .config import is_exportable, is_scriptable, is_no_jit
 # Activation *functions* by name: plain PyTorch implementations (the fallback table).
 _ACT_FN_DEFAULT = {
     'swish': swish,
     'mish': mish,
     'relu': F.relu,
     'relu6': F.relu6,
     'sigmoid': sigmoid,
     'tanh': tanh,
     'hard_sigmoid': hard_sigmoid,
     'hard_swish': hard_swish,
     'hard_mish': hard_mish,
 }

 # Activation functions by name: jit scripted variants.
 _ACT_FN_JIT = {
     'swish': swish_jit,
     'mish': mish_jit,
     'hard_sigmoid': hard_sigmoid_jit,
     'hard_swish': hard_swish_jit,
     'hard_mish': hard_mish_jit,
 }

 # Activation functions by name: memory-efficient variants with custom autograd.
 _ACT_FN_ME = {
     'swish': swish_me,
     'mish': mish_me,
     'hard_sigmoid': hard_sigmoid_me,
     'hard_swish': hard_swish_me,
     'hard_mish': hard_mish_me,
 }

 # Activation *layers* by name: plain PyTorch implementations (the fallback table).
 _ACT_LAYER_DEFAULT = {
     'swish': Swish,
     'mish': Mish,
     'relu': nn.ReLU,
     'relu6': nn.ReLU6,
     'sigmoid': Sigmoid,
     'tanh': Tanh,
     'hard_sigmoid': HardSigmoid,
     'hard_swish': HardSwish,
     'hard_mish': HardMish,
 }

 # Activation layers by name: jit scripted variants.
 _ACT_LAYER_JIT = {
     'swish': SwishJit,
     'mish': MishJit,
     'hard_sigmoid': HardSigmoidJit,
     'hard_swish': HardSwishJit,
     'hard_mish': HardMishJit,
 }

 # Activation layers by name: memory-efficient variants with custom autograd.
 _ACT_LAYER_ME = {
     'swish': SwishMe,
     'mish': MishMe,
     'hard_sigmoid': HardSigmoidMe,
     'hard_swish': HardSwishMe,
     'hard_mish': HardMishMe,
 }
 def get_act_fn(name='relu'):
     """ Activation Function Factory
     Fetching activation fns by name with this function allows export or torch script friendly
     functions to be returned dynamically based on current config.

     Returns None for a falsy name. Lookup order: memory-efficient table (only when
     none of no_jit / exportable / scriptable is set), then the jit table (unless
     no_jit is set), then the default table, which raises KeyError for unknown names.
     """
     if not name:
         return None
     jit_allowed = not is_no_jit()
     # custom-autograd versions are only used when not exporting or scripting the model
     if jit_allowed and not is_exportable() and not is_scriptable() and name in _ACT_FN_ME:
         return _ACT_FN_ME[name]
     if jit_allowed and name in _ACT_FN_JIT:
         return _ACT_FN_JIT[name]
     return _ACT_FN_DEFAULT[name]
 def get_act_layer(name='relu'):
     """ Activation Layer Factory
     Fetching activation layers by name with this function allows export or torch script friendly
     functions to be returned dynamically based on current config.

     Returns None for a falsy name. Lookup order: memory-efficient table (only when
     none of no_jit / exportable / scriptable is set), then the jit table (unless
     no_jit is set), then the default table, which raises KeyError for unknown names.
     """
     if not name:
         return None
     jit_allowed = not is_no_jit()
     if jit_allowed and not is_exportable() and not is_scriptable() and name in _ACT_LAYER_ME:
         return _ACT_LAYER_ME[name]
     if jit_allowed and name in _ACT_LAYER_JIT:
         return _ACT_LAYER_JIT[name]
     return _ACT_LAYER_DEFAULT[name]
 def create_act_layer(name, inplace=False, **kwargs):
     """Instantiate the activation layer registered under ``name``.

     Returns None when ``get_act_layer`` yields no layer class (falsy name);
     otherwise the class is called with ``inplace`` and any extra keyword arguments.
     """
     layer_cls = get_act_layer(name)
     if layer_cls is None:
         return None
     return layer_cls(inplace=inplace, **kwargs)
--- a/timm/models/layers/create_attn.py
+++ b/timm/models/layers/create_attn.py
@ -3,7 +3,7 @@
 Hacked together by Ross Wightman
 """
 import torch
-from .se import SEModule
+from .se import SEModule, EffectiveSEModule
 from .eca import EcaModule, CecaModule
 from .cbam import CbamModule, LightCbamModule
@ -15,6 +15,8 @@ def create_attn(attn_type, channels, **kwargs):
            attn_type = attn_type.lower()
            if attn_type == 'se':
                module_cls = SEModule
            elif attn_type == 'ese':
                module_cls = EffectiveSEModule
            elif attn_type == 'eca':
                module_cls = EcaModule
            elif attn_type == 'ceca':
--- a/timm/models/layers/create_conv2d.py
+++ b/timm/models/layers/create_conv2d.py
@ -8,23 +8,23 @@ from .cond_conv2d import CondConv2d
 from .conv2d_same import create_conv2d_pad
-def create_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+def create_conv2d(in_channels, out_channels, kernel_size, **kwargs):
    """ Select a 2d convolution implementation based on arguments
    Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d.
    Used extensively by EfficientNet, MobileNetv3 and related networks.
    """
    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
    if isinstance(kernel_size, list):
        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
        assert 'groups' not in kwargs  # MixedConv groups are defined by kernel list
        # We're going to use only lists for defining the MixedConv2d kernel groups,
        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
-        m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+        m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs)
    else:
        depthwise = kwargs.pop('depthwise', False)
-        groups = out_chs if depthwise else 1
+        groups = out_channels if depthwise else kwargs.pop('groups', 1)
        if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
-            m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+            m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
        else:
-            m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+            m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
    return m
--- a/timm/models/layers/create_norm_act.py
+++ b/timm/models/layers/create_norm_act.py
@ -1,37 +1,64 @@
 import types
 import functools
 import torch
 import torch.nn as nn
 from .evo_norm import EvoNormBatch2d, EvoNormSample2d
-from .norm_act import BatchNormAct2d
+from .norm_act import BatchNormAct2d, GroupNormAct
-try:
+from .inplace_abn import InplaceAbn
    from inplace_abn import InPlaceABN
    has_iabn = True
 except ImportError:
    has_iabn = False
 _NORM_ACT_TYPES = {BatchNormAct2d, GroupNormAct, EvoNormBatch2d, EvoNormSample2d, InplaceAbn}
-def create_norm_act(layer_type, num_features, jit=False, **kwargs):
+
-    layer_parts = layer_type.split('_')
+def get_norm_act_layer(layer_class):
-    assert len(layer_parts) in (1, 2)
+    layer_class = layer_class.replace('_', '').lower()
-    layer_class = layer_parts[0].lower()
+    if layer_class.startswith("batchnorm"):
-    #activation_class = layer_parts[1].lower() if len(layer_parts) > 1 else ''   # FIXME support string act selection
+        layer = BatchNormAct2d
-
+    elif layer_class.startswith("groupnorm"):
-    if layer_class == "batchnormact":
+        layer = GroupNormAct
        layer = BatchNormAct2d(num_features, **kwargs) # defaults to RELU of no kwargs override
    elif layer_class == "batchnormrelu":
        assert 'act_layer' not in kwargs
        layer = BatchNormAct2d(num_features, act_layer=nn.ReLU, **kwargs)
    elif layer_class == "evonormbatch":
-        layer = EvoNormBatch2d(num_features, **kwargs)
+        layer = EvoNormBatch2d
    elif layer_class == "evonormsample":
-        layer = EvoNormSample2d(num_features, **kwargs)
+        layer = EvoNormSample2d
    elif layer_class == "iabn" or layer_class == "inplaceabn":
-        if not has_iabn:
+        layer = InplaceAbn
            raise ImportError(
                "Pplease install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'")
        layer = InPlaceABN(num_features, **kwargs)
    else:
        assert False, "Invalid norm_act layer (%s)" % layer_class
    if jit:
        layer = torch.jit.script(layer)
    return layer
 def create_norm_act(layer_type, num_features, apply_act=True, jit=False, **kwargs):
     """Build a norm + activation layer instance from a string spec.

     ``layer_type`` has one or two ``-`` separated parts (e.g. ``batchnorm-leaky_relu``).
     Only the first part is used, to pick the layer class via ``get_norm_act_layer``.
     With ``jit=True`` the instance is passed through ``torch.jit.script`` before returning.
     """
     spec_parts = layer_type.split('-')  # e.g. batchnorm-leaky_relu
     assert len(spec_parts) in (1, 2)
     norm_act_cls = get_norm_act_layer(spec_parts[0])
     # FIXME support string act selection? The optional second part is currently ignored.
     instance = norm_act_cls(num_features, apply_act=apply_act, **kwargs)
     return torch.jit.script(instance) if jit else instance
 def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None):
     """Resolve a norm layer spec to a combined norm + activation layer and its kwargs.

     ``norm_layer`` may be a string (resolved via ``get_norm_act_layer``), a known
     norm-act type, a function/partial that builds one, or a plain norm class whose
     name starts with ``batchnorm`` / ``groupnorm``. Returns ``(norm_act_layer, norm_act_args)``
     where ``norm_act_args`` is a copy of ``norm_kwargs``; only the plain-norm-class
     case adds ``act_layer`` to it.
     """
     assert isinstance(norm_layer, (type, str,  types.FunctionType, functools.partial))
     assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial))
     norm_act_args = norm_kwargs.copy() if norm_kwargs else {}

     if isinstance(norm_layer, str):
         return get_norm_act_layer(norm_layer), norm_act_args

     # already a norm_act type, or assumed to be a lambda/fn/bound partial that creates a norm_act layer
     if norm_layer in _NORM_ACT_TYPES or isinstance(norm_layer, (types.FunctionType, functools.partial)):
         return norm_layer, norm_act_args

     type_name = norm_layer.__name__.lower()
     if type_name.startswith('batchnorm'):
         norm_act_layer = BatchNormAct2d
     elif type_name.startswith('groupnorm'):
         norm_act_layer = GroupNormAct
     else:
         assert False, f"No equivalent norm_act layer for {type_name}"
     # Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation.
     # Newer models will use `apply_act` and likely have `act_layer` arg bound to relevant NormAct types.
     norm_act_args['act_layer'] = act_layer
     return norm_act_layer, norm_act_args
--- a/timm/models/layers/drop.py
+++ b/timm/models/layers/drop.py
@ -17,8 +17,6 @@ Hacked together by Ross Wightman
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
 import math
 def drop_block_2d(
--- a/timm/models/layers/evo_norm.py
+++ b/timm/models/layers/evo_norm.py
@ -2,9 +2,9 @@
 An attempt at getting decent performing EvoNorms running in PyTorch.
 While currently faster than other impl, still quite a ways off the built-in BN
-in terms of memory usage and throughput.
+in terms of memory usage and throughput (roughly 5x mem, 1/2 - 1/3x speed).
-Still very much a WIP, fiddling with buffer usage, in-place optimizations, and layouts.
+Still very much a WIP, fiddling with buffer usage, in-place/jit optimizations, and layouts.
 Hacked together by Ross Wightman
 """
@ -14,15 +14,15 @@ import torch.nn as nn
 class EvoNormBatch2d(nn.Module):
-    def __init__(self, num_features, momentum=0.1, nonlin=True, eps=1e-5):
+    def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, drop_block=None):
        super(EvoNormBatch2d, self).__init__()
        self.apply_act = apply_act  # apply activation (non-linearity)
        self.momentum = momentum
        self.nonlin = nonlin
        self.eps = eps
        param_shape = (1, num_features, 1, 1)
        self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True)
        self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True)
-        if nonlin:
+        if apply_act:
            self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True)
        self.register_buffer('running_var', torch.ones(1, num_features, 1, 1))
        self.reset_parameters()
@ -30,7 +30,7 @@ class EvoNormBatch2d(nn.Module):
    def reset_parameters(self):
        nn.init.ones_(self.weight)
        nn.init.zeros_(self.bias)
-        if self.nonlin:
+        if self.apply_act:
            nn.init.ones_(self.v)
    def forward(self, x):
@ -40,46 +40,42 @@ class EvoNormBatch2d(nn.Module):
            var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
            self.running_var.copy_(self.momentum * var.detach() + (1 - self.momentum) * self.running_var)
        else:
-            var = self.running_var.clone()
+            var = self.running_var
-        if self.nonlin:
+        if self.apply_act:
            v = self.v.to(dtype=x_type)
-            d = (x * v) + x.var(dim=(2, 3), unbiased=False, keepdim=True).add_(self.eps).sqrt_().to(dtype=x_type)
+            d = (x * v) + (x.var(dim=(2, 3), unbiased=False, keepdim=True) + self.eps).sqrt().to(dtype=x_type)
-            d = d.max(var.add_(self.eps).sqrt_().to(dtype=x_type))
+            d = d.max((var + self.eps).sqrt().to(dtype=x_type))
            x = x / d
-            return x.mul_(self.weight).add_(self.bias)
+        return x * self.weight + self.bias
        else:
            return x.mul(self.weight).add_(self.bias)
 class EvoNormSample2d(nn.Module):
-    def __init__(self, num_features, nonlin=True, groups=8, eps=1e-5):
+    def __init__(self, num_features, apply_act=True, groups=8, eps=1e-5, drop_block=None):
        super(EvoNormSample2d, self).__init__()
-        self.nonlin = nonlin
+        self.apply_act = apply_act  # apply activation (non-linearity)
        self.groups = groups
        self.eps = eps
        param_shape = (1, num_features, 1, 1)
        self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True)
        self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True)
-        if nonlin:
+        if apply_act:
            self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True)
        self.reset_parameters()
    def reset_parameters(self):
        nn.init.ones_(self.weight)
        nn.init.zeros_(self.bias)
-        if self.nonlin:
+        if self.apply_act:
            nn.init.ones_(self.v)
    def forward(self, x):
        assert x.dim() == 4, 'expected 4D input'
        B, C, H, W = x.shape
        assert C % self.groups == 0
-        if self.nonlin:
+        if self.apply_act:
            n = (x * self.v).sigmoid().reshape(B, self.groups, -1)
            x = x.reshape(B, self.groups, -1)
-            x = n / x.var(dim=-1, unbiased=False, keepdim=True).add_(self.eps).sqrt_()
+            x = n / (x.var(dim=-1, unbiased=False, keepdim=True) + self.eps).sqrt()
            x = x.reshape(B, C, H, W)
-            return x.mul_(self.weight).add_(self.bias)
+        return x * self.weight + self.bias
        else:
            return x.mul(self.weight).add_(self.bias)
--- a/timm/models/layers/inplace_abn.py
+++ b/timm/models/layers/inplace_abn.py
@ -0,0 +1,85 @@
 import torch
 from torch import nn as nn
 # Optional dependency: use the real inplace_abn functions when the package is installed,
 # otherwise define same-named stubs so this module still imports.
 try:
    from inplace_abn.functions import inplace_abn, inplace_abn_sync
    has_iabn = True
 except ImportError:
    has_iabn = False
    # Stub that raises ImportError with install instructions as soon as it is called.
    def inplace_abn(x, weight, bias, running_mean, running_var,
                    training=True, momentum=0.1, eps=1e-05, activation="leaky_relu", activation_param=0.01):
        raise ImportError(
            "Please install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'")
    # Stub for the sync variant: forwards its keyword arguments to the stub above.
    def inplace_abn_sync(**kwargs):
        inplace_abn(**kwargs)
 class InplaceAbn(nn.Module):
     """Activated Batch Normalization

     This gathers a BatchNorm and an activation function in a single module

     Parameters
     ----------
     num_features : int
         Number of feature channels in the input and output.
     eps : float
         Small constant to prevent numerical issues.
     momentum : float
         Momentum factor applied to compute running statistics.
     affine : bool
         If `True` apply learned scale and shift transformation after normalization.
     apply_act : bool
         If `False` the activation name is forced to `identity` regardless of `act_layer`.
     act_layer : str or nn.Module type
         Name or type of the activation functions, one of: `leaky_relu`, `elu`, `identity`
         (as types: `nn.LeakyReLU`, `nn.ELU`, `nn.Identity`).
     act_param : float
         Negative slope for the `leaky_relu` activation.
     drop_block : optional
         Accepted in the constructor signature but not used by this layer.
     """

     def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True,
                  act_layer="leaky_relu", act_param=0.01, drop_block=None,):
         super(InplaceAbn, self).__init__()
         self.num_features = num_features
         self.affine = affine
         self.eps = eps
         self.momentum = momentum
         if apply_act:
             if isinstance(act_layer, str):
                 assert act_layer in ('leaky_relu', 'elu', 'identity')
                 self.act_name = act_layer
             else:
                 # convert act layer passed as type to string
                 self.act_name = self._act_name_from_layer(act_layer)
         else:
             self.act_name = 'identity'
         self.act_param = act_param
         if self.affine:
             self.weight = nn.Parameter(torch.ones(num_features))
             self.bias = nn.Parameter(torch.zeros(num_features))
         else:
             self.register_parameter('weight', None)
             self.register_parameter('bias', None)
         self.register_buffer('running_mean', torch.zeros(num_features))
         self.register_buffer('running_var', torch.ones(num_features))
         self.reset_parameters()

     @staticmethod
     def _act_name_from_layer(act_layer):
         """Map an activation layer class (or instance) to the name passed to `inplace_abn`."""
         known_layers = ((nn.ELU, 'elu'), (nn.LeakyReLU, 'leaky_relu'), (nn.Identity, 'identity'))
         for layer_cls, act_name in known_layers:
             # `act_layer` is normally a type, for which isinstance() against the same
             # class is False; compare the class itself and still accept instances.
             if act_layer is layer_cls or isinstance(act_layer, layer_cls):
                 return act_name
         layer_name = getattr(act_layer, '__name__', type(act_layer).__name__)
         raise AssertionError(f'Invalid act layer {layer_name} for IABN')

     def reset_parameters(self):
         """Reset running stats to mean 0 / var 1 and, if affine, weight to 1 / bias to 0."""
         nn.init.constant_(self.running_mean, 0)
         nn.init.constant_(self.running_var, 1)
         if self.affine:
             nn.init.constant_(self.weight, 1)
             nn.init.constant_(self.bias, 0)

     def forward(self, x):
         """Run `inplace_abn` on `x` with this module's parameters, stats and activation settings."""
         output = inplace_abn(
             x, self.weight, self.bias, self.running_mean, self.running_var,
             self.training, self.momentum, self.eps, self.act_name, self.act_param)
         # when the call returns a tuple, only its first element is the output
         if isinstance(output, tuple):
             output = output[0]
         return output
--- a/timm/models/layers/norm_act.py
+++ b/timm/models/layers/norm_act.py
@ -1,28 +1,33 @@
 """ Normalization + Activation Layers
 """
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
 from .create_act import get_act_layer
 class BatchNormAct2d(nn.BatchNorm2d):
    """BatchNorm + Activation
-    This module performs BatchNorm + Actibation in s manner that will remain bavkwards
+    This module performs BatchNorm + Activation in a manner that will remain backwards
    compatible with weights trained with separate bn, act. This is why we inherit from BN
    instead of composing it as a .bn member.
    """
-    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True,
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
-                 track_running_stats=True, act_layer=nn.ReLU, inplace=True):
+                 apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
-        super(BatchNormAct2d, self).__init__(num_features, eps, momentum, affine, track_running_stats)
+        super(BatchNormAct2d, self).__init__(
-        self.act = act_layer(inplace=inplace)
+            num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
-
+        if isinstance(act_layer, str):
-    def forward(self, x):
+            act_layer = get_act_layer(act_layer)
-        # FIXME cannot call parent forward() and maintain jit.script compatibility?
+        if act_layer is not None and apply_act:
-        # x = super(BatchNormAct2d, self).forward(x)
+            self.act = act_layer(inplace=inplace)
-
+        else:
-        # BEGIN nn.BatchNorm2d forward() cut & paste
+            self.act = None
        # self._check_input_dim(x)
    def _forward_jit(self, x):
        """ A cut & paste of the contents of the PyTorch BatchNorm2d forward function
        """
        # exponential_average_factor is self.momentum set to
        # (when it is available) only so that if gets updated
        # in ONNX graph when this node is exported to ONNX.
@ -41,10 +46,40 @@ class BatchNormAct2d(nn.BatchNorm2d):
                    exponential_average_factor = self.momentum
        x = F.batch_norm(
-            x, self.running_mean, self.running_var, self.weight, self.bias,
+                x, self.running_mean, self.running_var, self.weight, self.bias,
-            self.training or not self.track_running_stats,
+                self.training or not self.track_running_stats,
-            exponential_average_factor, self.eps)
+                exponential_average_factor, self.eps)
-        # END BatchNorm2d forward()
+        return x
    @torch.jit.ignore
    def _forward_python(self, x):
        return super(BatchNormAct2d, self).forward(x)
    def forward(self, x):
        """Apply batch norm, then the activation when one is configured.

        Uses `_forward_jit` while scripting and `_forward_python` otherwise.
        """
        # FIXME cannot call parent forward() and maintain jit.script compatibility?
        if torch.jit.is_scripting():
            x = self._forward_jit(x)
        else:
            # keep the normalized result; discarding it would skip batch norm in eager mode
            x = self._forward_python(x)
        if self.act is not None:
            x = self.act(x)
        return x
-        x = self.act(x)
+class GroupNormAct(nn.GroupNorm):
    def __init__(self, num_groups, num_channels, eps=1e-5, affine=True,
                 apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
        super(GroupNormAct, self).__init__(num_groups, num_channels, eps=eps, affine=affine)
        if isinstance(act_layer, str):
            act_layer = get_act_layer(act_layer)
        if act_layer is not None and apply_act:
            self.act = act_layer(inplace=inplace)
        else:
            self.act = None
    def forward(self, x):
        x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps)
        if self.act is not None:
            x = self.act(x)
        return x
--- a/timm/models/layers/pool2d_same.py
+++ b/timm/models/layers/pool2d_same.py
@ -6,7 +6,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Union, List, Tuple, Optional
 import math
 from .helpers import tup_pair
 from .padding import pad_same, get_padding_value
--- a/timm/models/layers/se.py
+++ b/timm/models/layers/se.py
@ -1,9 +1,11 @@
 from torch import nn as nn
 from .create_act import get_act_fn
 class SEModule(nn.Module):
-    def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None):
+    def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None,
                 gate_fn='hard_sigmoid'):
        super(SEModule, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        reduction_channels = reduction_channels or max(channels // reduction, min_channels)
@ -12,10 +14,27 @@ class SEModule(nn.Module):
        self.act = act_layer(inplace=True)
        self.fc2 = nn.Conv2d(
            reduction_channels, channels, kernel_size=1, padding=0, bias=True)
        self.gate_fn = get_act_fn(gate_fn)
    def forward(self, x):
        x_se = self.avg_pool(x)
        x_se = self.fc1(x_se)
        x_se = self.act(x_se)
        x_se = self.fc2(x_se)
-        return x * x_se.sigmoid()
+        return x * self.gate_fn(x_se)
 class EffectiveSEModule(nn.Module):
     """ 'Effective Squeeze-Excitation'
     From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667

     Gates the input with a 1x1 conv applied to its global average pool.
     """

     def __init__(self, channel, gate_fn='hard_sigmoid'):
         super().__init__()
         self.avg_pool = nn.AdaptiveAvgPool2d(1)
         self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
         self.gate_fn = get_act_fn(gate_fn)

     def forward(self, x):
         gate = self.fc(self.avg_pool(x))
         return x * self.gate_fn(gate, inplace=True)
--- a/timm/models/layers/selective_kernel.py
+++ b/timm/models/layers/selective_kernel.py
@ -4,7 +4,6 @@ Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
--- a/timm/models/layers/separable_conv.py
+++ b/timm/models/layers/separable_conv.py
@ -0,0 +1,51 @@
 from torch import nn as nn
 from .create_conv2d import create_conv2d
 from .create_norm_act import convert_norm_act_type
 class SeparableConvBnAct(nn.Module):
     """ Separable Conv w/ trailing Norm and Activation

     A depthwise conv, then a pointwise conv, then a norm + activation layer
     resolved from `norm_layer` / `act_layer` via `convert_norm_act_type`.
     """

     def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
                  channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
                  act_layer=nn.ReLU, apply_act=True, drop_block=None):
         super().__init__()
         norm_kwargs = norm_kwargs or {}
         dw_channels = int(in_channels * channel_multiplier)

         self.conv_dw = create_conv2d(
             in_channels, dw_channels, kernel_size,
             stride=stride, dilation=dilation, padding=padding, depthwise=True)

         # `bias` is applied to the pointwise conv only
         self.conv_pw = create_conv2d(
             dw_channels, out_channels, pw_kernel_size, padding=padding, bias=bias)

         norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs)
         self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args)

     def forward(self, x):
         x = self.conv_pw(self.conv_dw(x))
         if self.bn is None:
             return x
         return self.bn(x)
 class SeparableConv2d(nn.Module):
     """ Separable Conv

     A depthwise conv followed by a pointwise conv, with no norm or activation.
     """

     def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
                  channel_multiplier=1.0, pw_kernel_size=1):
         super().__init__()
         dw_channels = int(in_channels * channel_multiplier)

         self.conv_dw = create_conv2d(
             in_channels, dw_channels, kernel_size,
             stride=stride, dilation=dilation, padding=padding, depthwise=True)

         # `bias` is applied to the pointwise conv only
         self.conv_pw = create_conv2d(
             dw_channels, out_channels, pw_kernel_size, padding=padding, bias=bias)

     def forward(self, x):
         return self.conv_pw(self.conv_dw(x))
--- a/timm/models/layers/test_time_pool.py
+++ b/timm/models/layers/test_time_pool.py
@ -6,6 +6,7 @@ Hacked together by Ross Wightman
 import logging
 from torch import nn
 import torch.nn.functional as F
 from .adaptive_avgmax_pool import adaptive_avgmax_pool2d
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@ -7,13 +7,15 @@ Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
 Hacked together by Ross Wightman
 """
 import torch.nn as nn
 import torch.nn.functional as F
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
-from .efficientnet_builder import *
+from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
 from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights
 from .feature_hooks import FeatureHooks
 from .helpers import load_pretrained
-from .layers import SelectAdaptivePool2d, create_conv2d
+from .layers import SelectAdaptivePool2d, create_conv2d, get_act_fn, hard_sigmoid
 from .layers.activations import HardSwish, hard_sigmoid
 from .registry import register_model
 __all__ = ['MobileNetV3']
@ -273,8 +275,8 @@ def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kw
        head_bias=False,
        channel_multiplier=channel_multiplier,
        norm_kwargs=resolve_bn_args(kwargs),
-        act_layer=HardSwish,
+        act_layer=resolve_act_layer(kwargs, 'hard_swish'),
-        se_kwargs=dict(gate_fn=hard_sigmoid, reduce_mid=True, divisor=1),
+        se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=1),
        **kwargs,
    )
    model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -293,7 +295,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
    if 'small' in variant:
        num_features = 1024
        if 'minimal' in variant:
-            act_layer = nn.ReLU
+            act_layer = resolve_act_layer(kwargs, 'relu')
            arch_def = [
                # stage 0, 112x112 in
                ['ds_r1_k3_s2_e1_c16'],
@ -309,7 +311,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
                ['cn_r1_k1_s1_c576'],
            ]
        else:
-            act_layer = HardSwish
+            act_layer = resolve_act_layer(kwargs, 'hard_swish')
            arch_def = [
                # stage 0, 112x112 in
                ['ds_r1_k3_s2_e1_c16_se0.25_nre'],  # relu
@ -327,7 +329,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
    else:
        num_features = 1280
        if 'minimal' in variant:
-            act_layer = nn.ReLU
+            act_layer = resolve_act_layer(kwargs, 'relu')
            arch_def = [
                # stage 0, 112x112 in
                ['ds_r1_k3_s1_e1_c16'],
@ -345,7 +347,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
                ['cn_r1_k1_s1_c960'],
            ]
        else:
-            act_layer = HardSwish
+            act_layer = resolve_act_layer(kwargs, 'hard_swish')
            arch_def = [
                # stage 0, 112x112 in
                ['ds_r1_k3_s1_e1_c16_nre'],  # relu
--- a/timm/models/pnasnet.py
+++ b/timm/models/pnasnet.py
@ -43,11 +43,12 @@ class MaxPool(nn.Module):
        self.pool = nn.MaxPool2d(kernel_size, stride=stride, padding=padding)
    def forward(self, x):
-        if self.zero_pad:
+        if self.zero_pad is not None:
            x = self.zero_pad(x)
-        x = self.pool(x)
+            x = self.pool(x)
        if self.zero_pad:
            x = x[:, :, 1:, 1:]
        else:
            x = self.pool(x)
        return x
@ -90,11 +91,12 @@ class BranchSeparables(nn.Module):
    def forward(self, x):
        x = self.relu_1(x)
-        if self.zero_pad:
+        if self.zero_pad is not None:
            x = self.zero_pad(x)
-        x = self.separable_1(x)
+            x = self.separable_1(x)
        if self.zero_pad:
            x = x[:, :, 1:, 1:].contiguous()
        else:
            x = self.separable_1(x)
        x = self.bn_sep_1(x)
        x = self.relu_2(x)
        x = self.separable_2(x)
@ -171,15 +173,14 @@ class CellBase(nn.Module):
        x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
        x_comb_iter_4_left = self.comb_iter_4_left(x_left)
-        if self.comb_iter_4_right:
+        if self.comb_iter_4_right is not None:
            x_comb_iter_4_right = self.comb_iter_4_right(x_right)
        else:
            x_comb_iter_4_right = x_right
        x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
        x_out = torch.cat(
-            [x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3,
+            [x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
             x_comb_iter_4], 1)
        return x_out
@ -280,9 +281,8 @@ class Cell(CellBase):
                                                 kernel_size=3, stride=stride,
                                                 zero_pad=zero_pad)
        if is_reduction:
-            self.comb_iter_4_right = ReluConvBn(out_channels_right,
+            self.comb_iter_4_right = ReluConvBn(
-                                                out_channels_right,
+                out_channels_right, out_channels_right, kernel_size=1, stride=stride)
                                                kernel_size=1, stride=stride)
        else:
            self.comb_iter_4_right = None
--- a/timm/models/res2net.py
+++ b/timm/models/res2net.py
@ -77,6 +77,8 @@ class Bottle2neck(nn.Module):
        if self.is_first:
            # FIXME this should probably have count_include_pad=False, but hurts original weights
            self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)
        else:
            self.pool = None
        self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
@ -97,14 +99,22 @@ class Bottle2neck(nn.Module):
        spx = torch.split(out, self.width, 1)
        spo = []
        sp = spx[0]
        for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
-            sp = spx[i] if i == 0 or self.is_first else sp + spx[i]
+            if self.is_first:
                sp = spx[i]
            else:
                sp = sp + spx[i]
            sp = conv(sp)
            sp = bn(sp)
            sp = self.relu(sp)
            spo.append(sp)
        if self.scale > 1:
-            spo.append(self.pool(spx[-1]) if self.is_first else spx[-1])
+            if self.pool is not None:
                # self.is_first == True, None check for torchscript
                spo.append(self.pool(spx[-1]))
            else:
                spo.append(spx[-1])
        out = torch.cat(spo, 1)
        out = self.conv3(out)
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@ -200,7 +200,6 @@ class BasicBlock(nn.Module):
 class Bottleneck(nn.Module):
    __constants__ = ['se', 'downsample']  # for pre 1.4 torchscript compat
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
--- a/timm/models/selecsls.py
+++ b/timm/models/selecsls.py
@ -9,6 +9,7 @@ https://arxiv.org/abs/1907.00837
 Based on ResNet implementation in https://github.com/rwightman/pytorch-image-models
 and SelecSLS Net implementation in https://github.com/mehtadushy/SelecSLS-Pytorch
 """
 from typing import List
 import torch
 import torch.nn as nn
@ -52,6 +53,27 @@ default_cfgs = {
 }
 class SequentialList(nn.Sequential):
    """ `nn.Sequential` variant whose `forward` is declared to return a list of tensors.

    The two `torch.jit._overload_method` stubs declare the accepted input types (a list of
    tensors, or a single tensor); the final `forward` is the shared implementation.
    """
    def __init__(self, *args):
        super(SequentialList, self).__init__(*args)
    # Overload signature: List[Tensor] -> List[Tensor]. The body is intentionally just `pass`.
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
        # type: (List[torch.Tensor]) -> (List[torch.Tensor])
        pass
    # Overload signature: Tensor -> List[Tensor]. The body is intentionally just `pass`.
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
        # type: (torch.Tensor) -> (List[torch.Tensor])
        pass
    def forward(self, x) -> List[torch.Tensor]:
        """Feed `x` through every child module in order and return the last module's output."""
        for module in self:
            x = module(x)
        return x
 def conv_bn(in_chs, out_chs, k=3, stride=1, padding=None, dilation=1):
    if padding is None:
        padding = ((stride - 1) + dilation * (k - 1)) // 2
@ -77,7 +99,7 @@ class SelecSLSBlock(nn.Module):
        self.conv5 = conv_bn(mid_chs, mid_chs // 2, 3)
        self.conv6 = conv_bn(2 * mid_chs + (0 if is_first else skip_chs), out_chs, 1)
-    def forward(self, x):
+    def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
        assert isinstance(x, list)
        assert len(x) in [1, 2]
@ -113,7 +135,7 @@ class SelecSLS(nn.Module):
        super(SelecSLS, self).__init__()
        self.stem = conv_bn(in_chans, 32, stride=2)
-        self.features = nn.Sequential(*[cfg['block'](*block_args) for block_args in cfg['features']])
+        self.features = SequentialList(*[cfg['block'](*block_args) for block_args in cfg['features']])
        self.head = nn.Sequential(*[conv_bn(*conv_args) for conv_args in cfg['head']])
        self.num_features = cfg['num_features']
--- a/timm/models/tresnet.py
+++ b/timm/models/tresnet.py
@ -13,15 +13,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 from .helpers import load_pretrained
-from .layers import SpaceToDepthModule, AntiAliasDownsampleLayer, SelectAdaptivePool2d
+from .layers import SpaceToDepthModule, AntiAliasDownsampleLayer, SelectAdaptivePool2d, InplaceAbn
 from .registry import register_model
 try:
    from inplace_abn import InPlaceABN
    has_iabn = True
 except ImportError:
    has_iabn = False
 __all__ = ['tresnet_m', 'tresnet_l', 'tresnet_xl']
@ -91,37 +85,37 @@ class FastSEModule(nn.Module):
 def IABN2Float(module: nn.Module) -> nn.Module:
    """If `module` is IABN don't use half precision."""
-    if isinstance(module, InPlaceABN):
+    if isinstance(module, InplaceAbn):
        module.float()
    for child in module.children():
        IABN2Float(child)
    return module
-def conv2d_ABN(ni, nf, stride, activation="leaky_relu", kernel_size=3, activation_param=1e-2, groups=1):
+def conv2d_iabn(ni, nf, stride, kernel_size=3, groups=1, act_layer="leaky_relu", act_param=1e-2):
    return nn.Sequential(
        nn.Conv2d(
            ni, nf, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=groups, bias=False),
-        InPlaceABN(num_features=nf, activation=activation, activation_param=activation_param)
+        InplaceAbn(nf, act_layer=act_layer, act_param=act_param)
    )
 class BasicBlock(nn.Module):
    expansion = 1
-    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, anti_alias_layer=None):
+    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, aa_layer=None):
        super(BasicBlock, self).__init__()
        if stride == 1:
-            self.conv1 = conv2d_ABN(inplanes, planes, stride=1, activation_param=1e-3)
+            self.conv1 = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3)
        else:
-            if anti_alias_layer is None:
+            if aa_layer is None:
-                self.conv1 = conv2d_ABN(inplanes, planes, stride=2, activation_param=1e-3)
+                self.conv1 = conv2d_iabn(inplanes, planes, stride=2, act_param=1e-3)
            else:
                self.conv1 = nn.Sequential(
-                    conv2d_ABN(inplanes, planes, stride=1, activation_param=1e-3),
+                    conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3),
-                    anti_alias_layer(channels=planes, filt_size=3, stride=2))
+                    aa_layer(channels=planes, filt_size=3, stride=2))
-        self.conv2 = conv2d_ABN(planes, planes, stride=1, activation="identity")
+        self.conv2 = conv2d_iabn(planes, planes, stride=1, act_layer="identity")
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
@ -148,24 +142,25 @@ class BasicBlock(nn.Module):
 class Bottleneck(nn.Module):
    expansion = 4
-    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, anti_alias_layer=None):
+    def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True,
                 act_layer="leaky_relu", aa_layer=None):
        super(Bottleneck, self).__init__()
-        self.conv1 = conv2d_ABN(
+        self.conv1 = conv2d_iabn(
-            inplanes, planes, kernel_size=1, stride=1, activation="leaky_relu", activation_param=1e-3)
+            inplanes, planes, kernel_size=1, stride=1, act_layer=act_layer, act_param=1e-3)
        if stride == 1:
-            self.conv2 = conv2d_ABN(
+            self.conv2 = conv2d_iabn(
-                planes, planes, kernel_size=3, stride=1, activation="leaky_relu", activation_param=1e-3)
+                planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3)
        else:
-            if anti_alias_layer is None:
+            if aa_layer is None:
-                self.conv2 = conv2d_ABN(
+                self.conv2 = conv2d_iabn(
-                    planes, planes, kernel_size=3, stride=2, activation="leaky_relu", activation_param=1e-3)
+                    planes, planes, kernel_size=3, stride=2, act_layer=act_layer, act_param=1e-3)
            else:
                self.conv2 = nn.Sequential(
-                    conv2d_ABN(planes, planes, kernel_size=3, stride=1, activation="leaky_relu", activation_param=1e-3),
+                    conv2d_iabn(planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3),
-                    anti_alias_layer(channels=planes, filt_size=3, stride=2))
+                    aa_layer(channels=planes, filt_size=3, stride=2))
-        self.conv3 = conv2d_ABN(
+        self.conv3 = conv2d_iabn(
-            planes, planes * self.expansion, kernel_size=1, stride=1, activation="identity")
+            planes, planes * self.expansion, kernel_size=1, stride=1, act_layer="identity")
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
@ -195,30 +190,26 @@ class Bottleneck(nn.Module):
 class TResNet(nn.Module):
    def __init__(self, layers, in_chans=3, num_classes=1000, width_factor=1.0, no_aa_jit=False,
                 global_pool='avg', drop_rate=0.):
        if not has_iabn:
            raise ImportError(
                "For TResNet models, please install InplaceABN: "
                "'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'")
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        super(TResNet, self).__init__()
        # JIT layers
        space_to_depth = SpaceToDepthModule()
-        anti_alias_layer = partial(AntiAliasDownsampleLayer, no_jit=no_aa_jit)
+        aa_layer = partial(AntiAliasDownsampleLayer, no_jit=no_aa_jit)
        # TResnet stages
        self.inplanes = int(64 * width_factor)
        self.planes = int(64 * width_factor)
-        conv1 = conv2d_ABN(in_chans * 16, self.planes, stride=1, kernel_size=3)
+        conv1 = conv2d_iabn(in_chans * 16, self.planes, stride=1, kernel_size=3)
        layer1 = self._make_layer(
-            BasicBlock, self.planes, layers[0], stride=1, use_se=True, anti_alias_layer=anti_alias_layer)  # 56x56
+            BasicBlock, self.planes, layers[0], stride=1, use_se=True, aa_layer=aa_layer)  # 56x56
        layer2 = self._make_layer(
-            BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, anti_alias_layer=anti_alias_layer)  # 28x28
+            BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, aa_layer=aa_layer)  # 28x28
        layer3 = self._make_layer(
-            Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, anti_alias_layer=anti_alias_layer)  # 14x14
+            Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, aa_layer=aa_layer)  # 14x14
        layer4 = self._make_layer(
-            Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, anti_alias_layer=anti_alias_layer)  # 7x7
+            Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, aa_layer=aa_layer)  # 7x7
        # body
        self.body = nn.Sequential(OrderedDict([
@ -239,7 +230,7 @@ class TResNet(nn.Module):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu')
-            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InPlaceABN):
+            elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InplaceAbn):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
@ -251,24 +242,24 @@ class TResNet(nn.Module):
                m.conv3[1].weight = nn.Parameter(torch.zeros_like(m.conv3[1].weight))  # BN to zero
            if isinstance(m, nn.Linear): m.weight.data.normal_(0, 0.01)
-    def _make_layer(self, block, planes, blocks, stride=1, use_se=True, anti_alias_layer=None):
+    def _make_layer(self, block, planes, blocks, stride=1, use_se=True, aa_layer=None):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            layers = []
            if stride == 2:
                # avg pooling before 1x1 conv
                layers.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True, count_include_pad=False))
-            layers += [conv2d_ABN(
+            layers += [conv2d_iabn(
-                self.inplanes, planes * block.expansion, kernel_size=1, stride=1, activation="identity")]
+                self.inplanes, planes * block.expansion, kernel_size=1, stride=1, act_layer="identity")]
            downsample = nn.Sequential(*layers)
        layers = []
        layers.append(block(
-            self.inplanes, planes, stride, downsample, use_se=use_se, anti_alias_layer=anti_alias_layer))
+            self.inplanes, planes, stride, downsample, use_se=use_se, aa_layer=aa_layer))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
-                block(self.inplanes, planes, use_se=use_se, anti_alias_layer=anti_alias_layer))
+                block(self.inplanes, planes, use_se=use_se, aa_layer=aa_layer))
        return nn.Sequential(*layers)
    def get_classifier(self):
--- a/timm/models/vovnet.py
+++ b/timm/models/vovnet.py
@ -0,0 +1,408 @@
 """ VoVNet (V1 & V2)
 Papers:
 * `An Energy and GPU-Computation Efficient Backbone Network` - https://arxiv.org/abs/1904.09730
 * `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
 Looked at  https://github.com/youngwanLEE/vovnet-detectron2 &
 https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
 for some reference, rewrote most of the code.
 Hacked together by Ross Wightman
 """
 from typing import List
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .registry import register_model
 from .helpers import load_pretrained
 from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, SelectAdaptivePool2d, \
    create_attn, create_norm_act, get_norm_act_layer
 # model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 &
 # https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
 # Per-variant architecture settings, consumed by VovNet.__init__:
 #   stem_ch         - output channels of the three stem convs
 #   stage_conv_ch   - mid (per-layer) channels for each OSA stage
 #   stage_out_ch    - output channels for each OSA stage
 #   layer_per_block - number of mid convs inside every OSA block
 #   block_per_stage - number of OSA blocks in each stage
 #   residual        - enable the residual add (applied to non-first blocks of a stage)
 #   depthwise       - use separable convs in the stem (after the first conv) and in OSA blocks
 #   attn            - attention type name handed to create_attn ('' disables it)
 model_cfgs = {
    'vovnet39a': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 1, 2, 2],
        'residual': False,
        'depthwise': False,
        'attn': '',
    },
    'vovnet57a': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 1, 4, 3],
        'residual': False,
        'depthwise': False,
        'attn': '',
    },
    'ese_vovnet19b_slim_dw': {
        'stem_ch': [64, 64, 64],
        'stage_conv_ch': [64, 80, 96, 112],
        'stage_out_ch': [112, 256, 384, 512],
        'layer_per_block': 3,
        'block_per_stage': [1, 1, 1, 1],
        'residual': True,
        'depthwise': True,
        'attn': 'ese',
    },
    'ese_vovnet19b_dw': {
        'stem_ch': [64, 64, 64],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 3,
        'block_per_stage': [1, 1, 1, 1],
        'residual': True,
        'depthwise': True,
        'attn': 'ese',
    },
    'ese_vovnet19b_slim': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [64, 80, 96, 112],
        'stage_out_ch': [112, 256, 384, 512],
        'layer_per_block': 3,
        'block_per_stage': [1, 1, 1, 1],
        'residual': True,
        'depthwise': False,
        'attn': 'ese',
    },
    'ese_vovnet19b': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 3,
        'block_per_stage': [1, 1, 1, 1],
        'residual': True,
        'depthwise': False,
        'attn': 'ese',
    },
    'ese_vovnet39b': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 1, 2, 2],
        'residual': True,
        'depthwise': False,
        'attn': 'ese',
    },
    'ese_vovnet57b': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 1, 4, 3],
        'residual': True,
        'depthwise': False,
        'attn': 'ese',
    },
    'ese_vovnet99b': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 3, 9, 3],
        'residual': True,
        'depthwise': False,
        'attn': 'ese',
    },
    'eca_vovnet39b': {
        'stem_ch': [64, 64, 128],
        'stage_conv_ch': [128, 160, 192, 224],
        'stage_out_ch': [256, 512, 768, 1024],
        'layer_per_block': 5,
        'block_per_stage': [1, 1, 2, 2],
        'residual': True,
        'depthwise': False,
        'attn': 'eca',
    },
 }
 def _cfg(url=''):
    """ Return the default pretrained / data config dict for a VoVNet variant.

    Only `url` varies; every other entry (class count, input size, pool size, crop,
    interpolation, normalization stats, first conv and classifier names) is fixed.
    """
    return dict(
        url=url, num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
        crop_pct=0.875, interpolation='bicubic',
        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
        first_conv='stem.0.conv', classifier='head.fc',
    )
 # Default pretrained / data configs, keyed by the same variant names as `model_cfgs`.
 # `_vovnet` indexes both tables with one variant name, so every architecture in
 # `model_cfgs` needs an entry here (a missing one raises KeyError on lookup).
 # All urls are empty: no pretrained weights are referenced yet.
 default_cfgs = dict(
    vovnet39a=_cfg(url=''),
    vovnet57a=_cfg(url=''),
    ese_vovnet19b_slim_dw=_cfg(url=''),
    ese_vovnet19b_dw=_cfg(url=''),
    ese_vovnet19b_slim=_cfg(url=''),
    ese_vovnet19b=_cfg(url=''),  # previously missing although defined in model_cfgs
    ese_vovnet39b=_cfg(url=''),
    ese_vovnet57b=_cfg(url=''),
    ese_vovnet99b=_cfg(url=''),
    eca_vovnet39b=_cfg(url=''),
 )
 class SequentialAppendList(nn.Sequential):
    """ Sequential container that records every child's output and concatenates them.

    The first child receives `x`; each later child receives the most recent entry of
    `concat_list`. Every output is appended to `concat_list` (the caller's list is mutated
    in place), and the final result is all entries of `concat_list` joined along dim 1.
    """

    def __init__(self, *args):
        super().__init__(*args)

    def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor:
        for idx, layer in enumerate(self):
            layer_input = x if idx == 0 else concat_list[-1]
            concat_list.append(layer(layer_input))
        return torch.cat(concat_list, dim=1)
 class OsaBlock(nn.Module):
    """ OSA block: a chain of convs whose outputs are concatenated with the block input.

    `layer_per_block` convs are run in sequence through `SequentialAppendList`; the block
    input plus every intermediate output (`in_chs + layer_per_block * mid_chs` channels in
    total) are concatenated and passed to `conv_concat`, which produces `out_chs` channels.
    An optional attention module and an optional residual add of the block input follow.
    """

    def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
                 depthwise=False, attn='', norm_layer=BatchNormAct2d):
        super().__init__()
        self.residual = residual
        self.depthwise = depthwise

        # Separable mid convs map mid_chs -> mid_chs, so a 1x1 reduction is inserted first
        # whenever the input width differs.
        needs_reduction = self.depthwise and in_chs != mid_chs
        if needs_reduction:
            assert not residual
            self.conv_reduction = ConvBnAct(in_chs, mid_chs, 1, norm_layer=norm_layer)
        else:
            self.conv_reduction = None

        layers = []
        for idx in range(layer_per_block):
            if self.depthwise:
                layers.append(SeparableConvBnAct(mid_chs, mid_chs, norm_layer=norm_layer))
            else:
                # only the first regular conv sees the block's input width
                conv_in_chs = in_chs if idx == 0 else mid_chs
                layers.append(ConvBnAct(conv_in_chs, mid_chs, 3, norm_layer=norm_layer))
        self.conv_mid = SequentialAppendList(*layers)

        # feature aggregation over the block input and every mid conv output
        concat_chs = in_chs + layer_per_block * mid_chs
        self.conv_concat = ConvBnAct(concat_chs, out_chs, norm_layer=norm_layer)

        self.attn = create_attn(attn, out_chs) if attn else None

    def forward(self, x):
        shortcut = x
        # the (un-reduced) block input is always the first concatenated feature
        features = [shortcut]
        if self.conv_reduction is not None:
            x = self.conv_reduction(x)
        x = self.conv_concat(self.conv_mid(x, features))
        if self.attn is not None:
            x = self.attn(x)
        if self.residual:
            x = x + shortcut
        return x
 class OsaStage(nn.Module):
    """ A stage of `block_per_stage` OSA blocks, optionally preceded by a stride-2 max pool.

    The first block maps `in_chs` -> `out_chs` and never uses the residual add; later blocks
    map `out_chs` -> `out_chs` and use it when `residual` is set. Attention (`attn`) is only
    attached to the last block of the stage.
    """

    def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block,
                 downsample=True, residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d):
        super().__init__()
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True) if downsample else None

        final_idx = block_per_stage - 1
        osa_blocks = []
        for idx in range(block_per_stage):
            is_first = idx == 0
            osa_blocks.append(OsaBlock(
                in_chs if is_first else out_chs, mid_chs, out_chs, layer_per_block,
                residual=residual and not is_first,
                depthwise=depthwise, attn=attn if idx == final_idx else '', norm_layer=norm_layer))
        self.blocks = nn.Sequential(*osa_blocks)

    def forward(self, x):
        if self.pool is not None:
            x = self.pool(x)
        return self.blocks(x)
 class ClassifierHead(nn.Module):
    """ Classifier head: global pooling, flatten, optional dropout, then the `fc` layer.

    `fc` is an `nn.Linear(in_chs, num_classes)` when `num_classes > 0`, otherwise `nn.Identity`.
    """

    def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.):
        super().__init__()
        self.drop_rate = drop_rate
        self.global_pool = SelectAdaptivePool2d(pool_type=pool_type)
        self.fc = nn.Linear(in_chs, num_classes, bias=True) if num_classes > 0 else nn.Identity()

    def forward(self, x):
        pooled = self.global_pool(x).flatten(1)
        if self.drop_rate:
            # dropout is skipped entirely when drop_rate is zero / falsy
            pooled = F.dropout(pooled, p=float(self.drop_rate), training=self.training)
        return self.fc(pooled)
 class VovNet(nn.Module):
    """ VoVNet classification network: a three-conv stem, four OSA stages and a classifier head. """

    def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4,
                 norm_layer=BatchNormAct2d):
        """ VovNet (v2)

        `cfg` is one of the `model_cfgs` entries. `stem_stride` must be 4 or 2: the first stem
        conv always has stride 2 and the last stem conv has stride `stem_stride // 2`.
        """
        super().__init__()
        self.num_classes = num_classes
        self.drop_rate = drop_rate
        assert stem_stride in (4, 2)

        stem_ch = cfg["stem_ch"]
        stage_conv_ch = cfg["stage_conv_ch"]
        stage_out_ch = cfg["stage_out_ch"]
        block_per_stage = cfg["block_per_stage"]
        layer_per_block = cfg["layer_per_block"]

        # Stem module: the first conv is always a regular conv, the other two follow cfg["depthwise"]
        stem_conv = SeparableConvBnAct if cfg["depthwise"] else ConvBnAct
        self.stem = nn.Sequential(
            ConvBnAct(in_chans, stem_ch[0], 3, stride=2, norm_layer=norm_layer),
            stem_conv(stem_ch[0], stem_ch[1], 3, stride=1, norm_layer=norm_layer),
            stem_conv(stem_ch[1], stem_ch[2], 3, stride=stem_stride // 2, norm_layer=norm_layer),
        )

        # OSA stages: each stage consumes the previous stage's (or the stem's) output channels
        num_stages = 4
        stage_in_ch = stem_ch[-1:] + stage_out_ch[:-1]
        shared_args = dict(
            residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], norm_layer=norm_layer)
        stage_modules = []
        for idx in range(num_stages):
            # first stage has no stride/downsample if stem_stride is 4
            stage_modules.append(OsaStage(
                stage_in_ch[idx], stage_conv_ch[idx], stage_out_ch[idx], block_per_stage[idx], layer_per_block,
                downsample=stem_stride == 2 or idx > 0, **shared_args))
        self.num_features = stage_out_ch[num_stages - 1]
        self.stages = nn.Sequential(*stage_modules)

        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)

        # weight init: kaiming-normal for convs, unit weight / zero bias for nn.BatchNorm2d instances
        for mod in self.modules():
            if isinstance(mod, nn.Conv2d):
                nn.init.kaiming_normal_(mod.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(mod, nn.BatchNorm2d):
                nn.init.constant_(mod.weight, 1.)
                nn.init.constant_(mod.bias, 0.)

    def get_classifier(self):
        """ Return the head's final `fc` layer. """
        return self.head.fc

    def reset_classifier(self, num_classes, global_pool='avg'):
        """ Replace the head with a fresh `ClassifierHead` for `num_classes` outputs. """
        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)

    def forward_features(self, x):
        """ Run the stem and all OSA stages, returning the un-pooled feature map. """
        return self.stages(self.stem(x))

    def forward(self, x):
        return self.head(self.forward_features(x))
 def _vovnet(variant, pretrained=False, **kwargs):
    """ Build a `VovNet` for `variant`, attach its default cfg, and optionally load weights.

    `variant` indexes both `model_cfgs` and `default_cfgs`. The `features_only` kwarg is
    popped and currently trips an assertion (not implemented). All remaining kwargs go to
    the `VovNet` constructor.
    """
    strict = True
    if kwargs.pop('features_only', False):
        assert False, 'Not Implemented'  # TODO
        strict = False
        kwargs.pop('num_classes', 0)

    arch_cfg = model_cfgs[variant]
    pretrained_cfg = default_cfgs[variant]
    net = VovNet(arch_cfg, **kwargs)
    net.default_cfg = pretrained_cfg
    if not pretrained:
        return net

    load_pretrained(
        net, pretrained_cfg,
        num_classes=kwargs.get('num_classes', 0), in_chans=kwargs.get('in_chans', 3), strict=strict)
    return net
@register_model
 def vovnet39a(pretrained=False, **kwargs):
    """ Build the `vovnet39a` variant; `pretrained` and extra kwargs are forwarded to `_vovnet`. """
    variant = 'vovnet39a'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def vovnet57a(pretrained=False, **kwargs):
    """ Build the `vovnet57a` variant; `pretrained` and extra kwargs are forwarded to `_vovnet`. """
    variant = 'vovnet57a'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet19b_slim_dw(pretrained=False, **kwargs):
    """ Build the `ese_vovnet19b_slim_dw` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet19b_slim_dw'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet19b_dw(pretrained=False, **kwargs):
    """ Build the `ese_vovnet19b_dw` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet19b_dw'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet19b_slim(pretrained=False, **kwargs):
    """ Build the `ese_vovnet19b_slim` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet19b_slim'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet39b(pretrained=False, **kwargs):
    """ Build the `ese_vovnet39b` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet39b'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet57b(pretrained=False, **kwargs):
    """ Build the `ese_vovnet57b` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet57b'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def ese_vovnet99b(pretrained=False, **kwargs):
    """ Build the `ese_vovnet99b` variant; arguments are forwarded to `_vovnet`. """
    variant = 'ese_vovnet99b'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
@register_model
 def eca_vovnet39b(pretrained=False, **kwargs):
    """ Build the `eca_vovnet39b` variant; arguments are forwarded to `_vovnet`. """
    variant = 'eca_vovnet39b'
    return _vovnet(variant, pretrained=pretrained, **kwargs)
 # Experimental Models
@register_model
 def ese_vovnet39b_iabn(pretrained=False, **kwargs):
    """ Build `ese_vovnet39b` using the norm-act layer returned by `get_norm_act_layer('iabn')`. """
    return _vovnet(
        'ese_vovnet39b', pretrained=pretrained, norm_layer=get_norm_act_layer('iabn'), **kwargs)
@register_model
 def ese_vovnet39b_evos(pretrained=False, **kwargs):
    """ Build `ese_vovnet39b` with 'EvoNormSample' norm-act layers created with `jit=False`. """
    def norm_act_fn(num_features, **layer_kwargs):
        # factory handed to the model as `norm_layer`; extra kwargs pass through to create_norm_act
        return create_norm_act('EvoNormSample', num_features, jit=False, **layer_kwargs)

    variant = 'ese_vovnet39b'
    return _vovnet(variant, pretrained=pretrained, norm_layer=norm_act_fn, **kwargs)
--- a/validate.py
+++ b/validate.py
@ -24,7 +24,8 @@ try:
 except ImportError:
    has_apex = False
-from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models
+from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models,\
    set_scriptable, set_no_jit
 from timm.data import Dataset, DatasetTar, create_loader, resolve_data_config
 from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging
@ -84,6 +85,9 @@ def validate(args):
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher
    if args.torchscript:
        set_scriptable(True)
    # create model
    model = create_model(
        args.model,
@ -141,8 +145,10 @@ def validate(args):
    top5 = AverageMeter()
    model.eval()
    end = time.time()
    with torch.no_grad():
        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
        model(torch.randn((args.batch_size,) + data_config['input_size']).cuda())
        end = time.time()
        for i, (input, target) in enumerate(loader):
            if args.no_prefetcher:
                target = target.cuda()