diff --git a/timm/models/__init__.py b/timm/models/__init__.py
index 47cffac8..bc3c871a 100644
--- a/timm/models/__init__.py
+++ b/timm/models/__init__.py
@@ -1,3 +1,4 @@
+from .csp import *
 from .densenet import *
 from .dla import *
 from .dpn import *
diff --git a/timm/models/csp.py b/timm/models/csp.py
new file mode 100644
index 00000000..6286418b
--- /dev/null
+++ b/timm/models/csp.py
@@ -0,0 +1,475 @@
+"""PyTorch CspNet
+
+A PyTorch implementation of Cross Stage Partial Networks including:
+* CSPResNet50
+* CSPResNeXt50
+* CSPDarkNet53
+* and DarkNet53 for good measure
+
+Based on paper `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
+
+Reference impl via darknet cfg files at https://github.com/WongKinYiu/CrossStagePartialNetworks
+
+Hacked together by Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .features import FeatureNet
+from .helpers import load_pretrained
+from .layers import SelectAdaptivePool2d, ConvBnAct, DropPath, create_attn, get_norm_act_layer
+from .registry import register_model
+
+
+__all__ = ['CspNet']  # model_registry will add each entrypoint fn to this
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'conv1', 'classifier': 'fc',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'cspresnet50': _cfg(url=''),
+    'cspresnet50d': _cfg(url=''),
+    'cspresnet50w': _cfg(url=''),
+    'cspresnext50': _cfg(url=''),
+    'cspresnext50_iabn': _cfg(url=''),
+    'cspdarknet53': _cfg(url=''),
+    'cspdarknet53_iabn': _cfg(url=''),
+    'darknet53': _cfg(url=''),
+}
+
+
+model_cfgs = dict(
+    cspresnet50=dict(
+        stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
+        stage=dict(
+            out_chs=(128, 256, 512, 1024),
+            depth=(3, 3, 5, 2),
+            stride=(1,) + (2,) * 3,
+            exp_ratio=(2.,) * 4,
+            bottle_ratio=(0.5,) * 4,
+            block_ratio=(1.,) * 4,
+        )
+    ),
+    cspresnet50d=dict(
+        stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
+        stage=dict(
+            out_chs=(128, 256, 512, 1024),
+            depth=(3, 3, 5, 2),
+            stride=(1,) + (2,) * 3,
+            exp_ratio=(2.,) * 4,
+            bottle_ratio=(0.5,) * 4,
+            block_ratio=(1.,) * 4,
+        )
+    ),
+    cspresnet50w=dict(
+        stem=dict(out_chs=[32, 32, 64], kernel_size=3, stride=2, pool='max'),
+        stage=dict(
+            out_chs=(256, 512, 1024, 2048),
+            depth=(3, 3, 5, 2),
+            stride=(1,) + (2,) * 3,
+            exp_ratio=(1.,) * 4,
+            bottle_ratio=(0.25,) * 4,
+            block_ratio=(0.5,) * 4,
+        )
+    ),
+    cspresnext50=dict(
+        stem=dict(out_chs=64, kernel_size=7, stride=2, pool='max'),
+        stage=dict(
+            out_chs=(256, 512, 1024, 2048),
+            depth=(3, 3, 5, 2),
+            stride=(1,) + (2,) * 3,
+            groups=(32,) * 4,
+            exp_ratio=(1.,) * 4,
+            bottle_ratio=(1.,) * 4,
+            block_ratio=(0.5,) * 4,
+        )
+    ),
+    cspdarknet53=dict(
+        stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stage=dict(
+            out_chs=(64, 128, 256, 512, 1024),
+            depth=(1, 2, 8, 8, 4),
+            stride=(2,) * 5,
+            exp_ratio=(2.,) + (1.,) * 4,
+            bottle_ratio=(0.5,) + (1.0,) * 4,
+            block_ratio=(1.,) + (0.5,) * 4,
+            down_growth=True,
+        )
+    ),
+    darknet53=dict(
+        stem=dict(out_chs=32, kernel_size=3, stride=1, pool=''),
+        stage=dict(
+            out_chs=(64, 128, 256, 512, 1024),
+            depth=(1, 2, 8, 8, 4),
+            stride=(2,) * 5,
+            bottle_ratio=(0.5,) * 5,
+            block_ratio=(1.,) * 5,
+        )
+    )
+)
+
+
+def create_stem(
+        in_chans=3, out_chs=32, kernel_size=3, stride=2, pool='',
+        act_layer=None, norm_layer=None, aa_layer=None):
+    stem = nn.Sequential()
+    if not isinstance(out_chs, (tuple, list)):
+        out_chs = [out_chs]
+    assert len(out_chs)
+    in_c = in_chans
+    for i, out_c in enumerate(out_chs):
+        conv_name = f'conv{i + 1}'
+        stem.add_module(conv_name, ConvBnAct(
+            in_c, out_c, kernel_size, stride=stride if i == 0 else 1,
+            act_layer=act_layer, norm_layer=norm_layer))
+        in_c = out_c
+        last_conv = conv_name
+    if pool:
+        if aa_layer is not None:
+            stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1))
+            stem.add_module('aa', aa_layer(channels=in_c, stride=2))
+        else:
+            stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
+    return stem, dict(num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv]))
+
+
+class ResBottleneck(nn.Module):
+    """ ResNe(X)t Bottleneck Block
+    """
+
+    def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.25, groups=1,
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_last=False,
+                 attn_layer=None, aa_layer=None, drop_block=None, drop_path=None):
+        super(ResBottleneck, self).__init__()
+        mid_chs = int(round(out_chs * bottle_ratio))
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block)
+
+        self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.conv2 = ConvBnAct(mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs)
+        self.attn2 = create_attn(attn_layer, channels=mid_chs) if not attn_last else None
+        self.conv3 = ConvBnAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs)
+        self.attn3 = create_attn(attn_layer, channels=out_chs) if attn_last else None
+        self.drop_path = drop_path
+        self.act3 = act_layer(inplace=True)
+
+    def zero_init_last_bn(self):
+        nn.init.zeros_(self.conv3.bn.weight)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        if self.attn2 is not None:
+            x = self.attn2(x)
+        x = self.conv3(x)
+        if self.attn3 is not None:
+            x = self.attn3(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+        x = x + shortcut
+        # FIXME partial shortcut needed if first block handled as per original, not used for my current impl
+        #x[:, :shortcut.size(1)] += shortcut
+        x = self.act3(x)
+        return x
+
+
+class DarkBlock(nn.Module):
+    """ DarkNet Block
+    """
+
+    def __init__(self, in_chs, out_chs, dilation=1, bottle_ratio=0.5, groups=1,
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None,
+                 drop_block=None, drop_path=None):
+        super(DarkBlock, self).__init__()
+        mid_chs = int(round(out_chs * bottle_ratio))
+        ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, drop_block=drop_block)
+        self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs)
+        self.conv2 = ConvBnAct(mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups, **ckwargs)
+        self.attn = create_attn(attn_layer, channels=out_chs)
+        self.drop_path = drop_path
+
+    def zero_init_last_bn(self):
+        nn.init.zeros_(self.conv2.bn.weight)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        if self.attn is not None:
+            x = self.attn(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+        x = x + shortcut
+        return x
+
+
+class CrossStage(nn.Module):
+    """Cross Stage."""
+    def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., exp_ratio=1.,
+                 groups=1, first_dilation=None, down_growth=False, block_dpr=None,
+                 block_fn=ResBottleneck, **block_kwargs):
+        super(CrossStage, self).__init__()
+        first_dilation = first_dilation or dilation
+        down_chs = out_chs if down_growth else in_chs  # grow downsample channels to output channels
+        exp_chs = int(round(out_chs * exp_ratio))
+        block_out_chs = int(round(out_chs * block_ratio))
+        conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'))
+
+        if stride != 1 or first_dilation != dilation:
+            self.conv_down = ConvBnAct(
+                in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
+                aa_layer=block_kwargs.get('aa_layer', None), **conv_kwargs)
+            prev_chs = down_chs
+        else:
+            self.conv_down = None
+            prev_chs = in_chs
+
+        # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also,
+        # there is a special case for the first stage of some of the models that results in an uneven split
+        # across the two paths. I did it this way for simplicity for now.
+        self.conv_exp = ConvBnAct(prev_chs, exp_chs, kernel_size=1, **conv_kwargs)
+        prev_chs = exp_chs // 2  # output of conv_exp is always split in two
+
+        self.blocks = nn.Sequential()
+        for i in range(depth):
+            drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
+            self.blocks.add_module(str(i), block_fn(
+                prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
+            prev_chs = block_out_chs
+
+        # transition convs
+        self.conv_transition_b = ConvBnAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs)
+        self.conv_transition = ConvBnAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs)
+
+    def forward(self, x):
+        if self.conv_down is not None:
+            x = self.conv_down(x)
+        x = self.conv_exp(x)
+        xs, xb = x.chunk(2, dim=1)
+        xb = self.blocks(xb)
+        out = self.conv_transition(torch.cat([xs, self.conv_transition_b(xb)], dim=1))
+        return out
+
+
+class DarkStage(nn.Module):
+    """DarkNet stage."""
+
+    def __init__(self, in_chs, out_chs, stride, dilation, depth, block_ratio=1., bottle_ratio=1., groups=1,
+                 first_dilation=None, block_fn=ResBottleneck, block_dpr=None, **block_kwargs):
+        super(DarkStage, self).__init__()
+        first_dilation = first_dilation or dilation
+
+        self.conv_down = ConvBnAct(
+            in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups,
+            act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer'),
+            aa_layer=block_kwargs.get('aa_layer', None))
+
+        prev_chs = out_chs
+        block_out_chs = int(round(out_chs * block_ratio))
+        self.blocks = nn.Sequential()
+        for i in range(depth):
+            drop_path = DropPath(block_dpr[i]) if block_dpr and block_dpr[i] else None
+            self.blocks.add_module(str(i), block_fn(
+                prev_chs, block_out_chs, dilation, bottle_ratio, groups, drop_path=drop_path, **block_kwargs))
+            prev_chs = block_out_chs
+
+    def forward(self, x):
+        x = self.conv_down(x)
+        x = self.blocks(x)
+        return x
+
+
+class ClassifierHead(nn.Module):
+    """Head."""
+
+    def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.):
+        super(ClassifierHead, self).__init__()
+        self.drop_rate = drop_rate
+        self.global_pool = SelectAdaptivePool2d(pool_type=pool_type)
+        if num_classes > 0:
+            self.fc = nn.Linear(in_chs, num_classes, bias=True)
+        else:
+            self.fc = nn.Identity()
+
+    def forward(self, x):
+        x = self.global_pool(x).flatten(1)
+        if self.drop_rate:
+            x = F.dropout(x, p=float(self.drop_rate), training=self.training)
+        x = self.fc(x)
+        return x
+
+
+def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32, drop_path_rate=0.):
+    # get per stage args for the stages and their contained blocks, calculate strides to meet target output_stride
+    num_stages = len(cfg['depth'])
+    if 'groups' not in cfg:
+        cfg['groups'] = (1,) * num_stages
+    if 'down_growth' in cfg and not isinstance(cfg['down_growth'], (list, tuple)):
+        cfg['down_growth'] = (cfg['down_growth'],) * num_stages
+    cfg['block_dpr'] = [None] * num_stages if not drop_path_rate else \
+        [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg['depth'])).split(cfg['depth'])]
+    stage_strides = []
+    stage_dilations = []
+    stage_first_dilations = []
+    dilation = 1
+    for cfg_stride in cfg['stride']:
+        stage_first_dilations.append(dilation)
+        if curr_stride >= output_stride:
+            dilation *= cfg_stride
+            stride = 1
+        else:
+            stride = cfg_stride
+            curr_stride *= stride
+        stage_strides.append(stride)
+        stage_dilations.append(dilation)
+    cfg['stride'] = stage_strides
+    cfg['dilation'] = stage_dilations
+    cfg['first_dilation'] = stage_first_dilations
+    stage_args = [dict(zip(cfg.keys(), values)) for values in zip(*cfg.values())]
+    return stage_args
+
+
+class CspNet(nn.Module):
+    """Cross Stage Partial base model.
+
+    Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929
+    Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks
+
+    NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the
+    darknet impl. I did it this way for simplicity and fewer special cases.
+    """
+
+    def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg', drop_rate=0.,
+                 act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_path_rate=0.,
+                 zero_init_last_bn=True, stage_fn=CrossStage, block_fn=ResBottleneck):
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert output_stride in (8, 16, 32)
+        layer_args = dict(act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer)
+
+        # Construct the stem
+        self.stem, stem_feat_info = create_stem(in_chans, **cfg['stem'], **layer_args)
+        self.feature_info = [stem_feat_info]
+        prev_chs = stem_feat_info['num_chs']
+        curr_stride = stem_feat_info['reduction']  # reduction does not include pool
+        if cfg['stem']['pool']:
+            curr_stride *= 2
+
+        # Construct the stages
+        per_stage_args = _cfg_to_stage_args(
+            cfg['stage'], curr_stride=curr_stride, output_stride=output_stride, drop_path_rate=drop_path_rate)
+        self.stages = nn.Sequential()
+        for i, sa in enumerate(per_stage_args):
+            self.stages.add_module(
+                str(i), stage_fn(prev_chs, **sa, **layer_args, block_fn=block_fn))
+            prev_chs = sa['out_chs']
+            curr_stride *= sa['stride']
+            self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
+
+        # Construct the head
+        self.num_features = prev_chs
+        self.head = ClassifierHead(
+            in_chs=prev_chs, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, mean=0.0, std=0.01)
+                nn.init.zeros_(m.bias)
+        if zero_init_last_bn:
+            for m in self.modules():
+                if hasattr(m, 'zero_init_last_bn'):
+                    m.zero_init_last_bn()
+
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.head = ClassifierHead(
+            self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
+
+
+def _cspnet(variant, pretrained=False, **kwargs):
+    features = False
+    out_indices = None
+    if kwargs.pop('features_only', False):
+        features = True
+        out_indices = kwargs.pop('out_indices', (0, 1, 2, 3, 4))
+    cfg_variant = variant.split('_')[0]
+    cfg = model_cfgs[cfg_variant]
+    model = CspNet(cfg, **kwargs)
+    model.default_cfg = default_cfgs[variant]
+    if pretrained:
+        load_pretrained(
+            model,
+            num_classes=kwargs.get('num_classes', 0), in_chans=kwargs.get('in_chans', 3), strict=not features)
+    if features:
+        model = FeatureNet(model, out_indices, flatten_sequential=True)
+    return model
+
+
+@register_model
+def cspresnet50(pretrained=False, **kwargs):
+    return _cspnet('cspresnet50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnet50d(pretrained=False, **kwargs):
+    return _cspnet('cspresnet50d', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnet50w(pretrained=False, **kwargs):
+    return _cspnet('cspresnet50w', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnext50(pretrained=False, **kwargs):
+    return _cspnet('cspresnext50', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def cspresnext50_iabn(pretrained=False, **kwargs):
+    norm_layer = get_norm_act_layer('iabn')
+    return _cspnet('cspresnext50_iabn', pretrained=pretrained, norm_layer=norm_layer, **kwargs)
+
+
+@register_model
+def cspdarknet53(pretrained=False, **kwargs):
+    return _cspnet('cspdarknet53', pretrained=pretrained, block_fn=DarkBlock, **kwargs)
+
+
+@register_model
+def cspdarknet53_iabn(pretrained=False, **kwargs):
+    norm_layer = get_norm_act_layer('iabn')
+    return _cspnet('cspdarknet53_iabn', pretrained=pretrained, block_fn=DarkBlock, norm_layer=norm_layer, **kwargs)
+
+
+@register_model
+def darknet53(pretrained=False, **kwargs):
+    return _cspnet('darknet53', pretrained=pretrained, block_fn=DarkBlock, stage_fn=DarkStage, **kwargs)
diff --git a/timm/models/layers/create_norm_act.py b/timm/models/layers/create_norm_act.py
index 7bdaa125..8fd500a3 100644
--- a/timm/models/layers/create_norm_act.py
+++ b/timm/models/layers/create_norm_act.py
@@ -9,7 +9,7 @@ from .norm_act import BatchNormAct2d, GroupNormAct
 from .inplace_abn import InplaceAbn
 
 _NORM_ACT_TYPES = {BatchNormAct2d, GroupNormAct, EvoNormBatch2d, EvoNormSample2d, InplaceAbn}
-
+_NORM_ACT_REQUIRES_ARG = {BatchNormAct2d, GroupNormAct, InplaceAbn}  # requires act_layer arg to define act type
 
 def get_norm_act_layer(layer_class):
     layer_class = layer_class.replace('_', '').lower()
@@ -58,7 +58,9 @@ def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None):
             norm_act_layer = GroupNormAct
         else:
             assert False, f"No equivalent norm_act layer for {type_name}"
+    if norm_act_layer in _NORM_ACT_REQUIRES_ARG:
         # Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation.
-        # Newer models will use `apply_act` and likely have `act_layer` arg bound to relevant NormAct types.
+        # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types.
+        # It is intended that functions/partials do not trigger this; they should define act themselves.
         norm_act_args.update(dict(act_layer=act_layer))
     return norm_act_layer, norm_act_args
diff --git a/timm/models/layers/inplace_abn.py b/timm/models/layers/inplace_abn.py
index d78079db..c7edac62 100644
--- a/timm/models/layers/inplace_abn.py
+++ b/timm/models/layers/inplace_abn.py
@@ -38,7 +38,7 @@ class InplaceAbn(nn.Module):
     """
 
     def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True,
-                 act_layer="leaky_relu", act_param=0.01, drop_block=None,):
+                 act_layer="leaky_relu", act_param=0.01, drop_block=None):
         super(InplaceAbn, self).__init__()
         self.num_features = num_features
         self.affine = affine
@@ -46,14 +46,16 @@ class InplaceAbn(nn.Module):
         self.momentum = momentum
         if apply_act:
             if isinstance(act_layer, str):
-                assert act_layer in ('leaky_relu', 'elu', 'identity')
-                self.act_name = act_layer
+                assert act_layer in ('leaky_relu', 'elu', 'identity', '')
+                self.act_name = act_layer if act_layer else 'identity'
             else:
                 # convert act layer passed as type to string
-                if isinstance(act_layer, nn.ELU):
+                if act_layer == nn.ELU:
                     self.act_name = 'elu'
-                elif isinstance(act_layer, nn.LeakyReLU):
+                elif act_layer == nn.LeakyReLU:
                     self.act_name = 'leaky_relu'
+                elif act_layer == nn.Identity:
+                    self.act_name = 'identity'
                 else:
                     assert False, f'Invalid act layer {act_layer.__name__} for IABN'
         else:
diff --git a/timm/models/layers/se.py b/timm/models/layers/se.py
index 4e47cc96..578ebf08 100644
--- a/timm/models/layers/se.py
+++ b/timm/models/layers/se.py
@@ -28,10 +28,10 @@ class EffectiveSEModule(nn.Module):
     """ 'Effective Squeeze-Excitation
     From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
     """
-    def __init__(self, channel, gate_fn='hard_sigmoid'):
+    def __init__(self, channels, gate_fn='hard_sigmoid'):
         super(EffectiveSEModule, self).__init__()
         self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0)
         self.gate_fn = get_act_fn(gate_fn)
 
     def forward(self, x):
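Usage sketch (not part of the patch): a minimal smoke test of the new entrypoints, assuming this branch of timm is installed. It relies only on `timm.create_model` and the model names registered above; since the `default_cfgs` URLs are still empty, `pretrained=True` is not expected to resolve weights yet.

    import torch
    import timm

    # Classification: the default _cfg above expects (3, 224, 224) inputs and 1000 classes.
    model = timm.create_model('cspresnet50', pretrained=False)
    model.eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 3, 224, 224))
    print(logits.shape)  # expected: torch.Size([1, 1000])

    # Feature extraction: `features_only` is popped inside _cspnet and the model is
    # wrapped in FeatureNet, so a forward pass should return a list of intermediate
    # feature maps (stem + stages, selectable via `out_indices`).
    feat_model = timm.create_model('cspdarknet53', features_only=True)
    with torch.no_grad():
        feats = feat_model(torch.randn(1, 3, 224, 224))
    print([f.shape for f in feats])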