Merge remote-tracking branch 'origin/attn_update' into bits_and_tpu

pull/1239/head
Ross Wightman 3 years ago
commit c2f02b08b8

@@ -267,7 +267,9 @@ def _build_params_dict_single(weight, bias, **kwargs):
return [dict(params=bias, **kwargs)]
#@pytest.mark.parametrize('optimizer', ['sgd', 'momentum'])
# FIXME momentum variant frequently fails in GitHub runner, but never local after many attempts
@pytest.mark.parametrize('optimizer', ['sgd'])
def test_sgd(optimizer):
_test_basic_cases(
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3)
@@ -320,7 +322,7 @@ def test_sgd(optimizer):
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3, momentum=1)
)
_test_basic_cases(
lambda weight, bias: create_optimizer_v2([weight, bias], optimizer, lr=1e-3, momentum=1, weight_decay=.1)
)
_test_rosenbrock(
lambda params: create_optimizer_v2(params, optimizer, lr=1e-3)

@@ -49,3 +49,80 @@ class OrderedDistributedSampler(Sampler):
def __len__(self):
return self.num_samples
class RepeatAugSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset for distributed,
with repeated augmentation.
It ensures that different each augmented version of a sample will be visible to a
different process (GPU). Heavily based on torch.utils.data.DistributedSampler
This sampler was taken from https://github.com/facebookresearch/deit/blob/0c4b8f60/samplers.py
Used in
Copyright (c) 2015-present, Facebook, Inc.
"""
def __init__(
self,
dataset,
num_replicas=None,
rank=None,
shuffle=True,
num_repeats=3,
selected_round=256,
selected_ratio=0,
):
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
self.shuffle = shuffle
self.num_repeats = num_repeats
self.epoch = 0
self.num_samples = int(math.ceil(len(self.dataset) * num_repeats / self.num_replicas))
self.total_size = self.num_samples * self.num_replicas
# Determine the number of samples to select per epoch for each rank.
# num_selected logic defaults to be the same as original RASampler impl, but this one can be tweaked
# via selected_ratio and selected_round args.
selected_ratio = selected_ratio or num_replicas # ratio to reduce selected samples by, num_replicas if 0
if selected_round:
self.num_selected_samples = int(math.floor(
len(self.dataset) // selected_round * selected_round / selected_ratio))
else:
self.num_selected_samples = int(math.ceil(len(self.dataset) / selected_ratio))
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
if self.shuffle:
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = list(range(len(self.dataset)))
# produce repeats e.g. [0, 0, 0, 1, 1, 1, 2, 2, 2....]
indices = [x for x in indices for _ in range(self.num_repeats)]
# add extra samples to make it evenly divisible
padding_size = self.total_size - len(indices)
indices += indices[:padding_size]
assert len(indices) == self.total_size
# subsample per rank
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples
# return up to num selected samples
return iter(indices[:self.num_selected_samples])
def __len__(self):
return self.num_selected_samples
def set_epoch(self, epoch):
self.epoch = epoch
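A minimal usage sketch (not part of the diff): RepeatAugSampler drops into a DataLoader in place of DistributedSampler, and set_epoch() still needs to be called every epoch so the shuffle seed changes. num_replicas/rank are passed explicitly below only so the sketch runs without an initialized process group; the dataset is a stand-in.

import torch
from torch.utils.data import DataLoader, TensorDataset
from timm.data.distributed_sampler import RepeatAugSampler

# stand-in dataset (512 samples so the default selected_round=256 keeps a non-zero selection)
train_dataset = TensorDataset(torch.randn(512, 3, 32, 32), torch.zeros(512).long())
sampler = RepeatAugSampler(train_dataset, num_replicas=1, rank=0, num_repeats=3, shuffle=True)
loader = DataLoader(train_dataset, batch_size=16, sampler=sampler)

for epoch in range(2):
    sampler.set_epoch(epoch)  # reseed the deterministic shuffle each epoch
    for images, targets in loader:
        pass  # in a real run, each process sees a different augmented repeat of each selected sample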

@@ -1,3 +1,4 @@
from .asymmetric_loss import AsymmetricLossMultiLabel, AsymmetricLossSingleLabel
from .binary_cross_entropy import DenseBinaryCrossEntropy
from .cross_entropy import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy
from .jsd import JsdCrossEntropy

@@ -0,0 +1,23 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
class DenseBinaryCrossEntropy(nn.Module):
""" BCE using one-hot from dense targets w/ label smoothing
NOTE for experiments comparing CE to BCE /w label smoothing, may remove
"""
def __init__(self, smoothing=0.1):
super(DenseBinaryCrossEntropy, self).__init__()
assert 0. <= smoothing < 1.0
self.smoothing = smoothing
self.bce = nn.BCEWithLogitsLoss()
def forward(self, x, target):
num_classes = x.shape[-1]
off_value = self.smoothing / num_classes
on_value = 1. - self.smoothing + off_value
target = target.long().view(-1, 1)
target = torch.full(
(target.size()[0], num_classes), off_value, device=x.device, dtype=x.dtype).scatter_(1, target, on_value)
return self.bce(x, target)
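As a quick illustration (a sketch assuming a timm build that includes this commit, not part of the diff): the loss takes raw logits plus dense class indices, expands the indices into a smoothed one-hot target on the fly, and feeds both to BCEWithLogitsLoss.

import torch
from timm.loss import DenseBinaryCrossEntropy  # exported via the __init__ change above

criterion = DenseBinaryCrossEntropy(smoothing=0.1)
logits = torch.randn(8, 1000)            # (batch, num_classes) raw scores
targets = torch.randint(0, 1000, (8,))   # dense class indices, not one-hot
loss = criterion(logits, targets)        # smoothed one-hot is built internally, then BCE w/ logits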

@@ -33,73 +33,91 @@ def _cfg(url='', **kwargs):
default_cfgs = {
# GPU-Efficient (ResNet) weights
'botnet26t_256': _cfg(
url='',
fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'botnet50t_256': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/botnet50t_256-a0e6c3b1.pth',
fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'eca_botnext26ts_256': _cfg(
url='',
fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'eca_botnext50ts_256': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_botnext26ts_256-fb3bf984.pth',
fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'halonet_h1': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
'halonet26t': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halonet26t_256-9b4bf0b3.pth',
input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
'sehalonet33ts': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
'halonet50ts': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
'eca_halonext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_halonext26ts_256-1e55880b.pth',
input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)),
'lambda_resnet26t': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet26t_256-b040fce6.pth',
min_input_size=(3, 128, 128), input_size=(3, 256, 256), pool_size=(8, 8)),
'swinnet26t_256': _cfg(url='', fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'swinnet50ts_256': _cfg(url='', fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'eca_swinnext26ts_256': _cfg(url='', fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)),
'rednet26t': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
'rednet50ts': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
}
model_cfgs = dict( model_cfgs = dict(
botnet26t=ByoModelCfg( botnet26t=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
fixed_input_size=True,
self_attn_layer='bottleneck',
self_attn_kwargs=dict()
),
botnet50t=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
fixed_input_size=True, fixed_input_size=True,
self_attn_layer='bottleneck', self_attn_layer='bottleneck',
self_attn_kwargs=dict() self_attn_kwargs=dict()
), ),
botnet50ts=ByoModelCfg( eca_botnext26ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=2, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=1, gs=0, br=0.25), ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='', stem_pool='maxpool',
num_features=0,
fixed_input_size=True, fixed_input_size=True,
act_layer='silu', act_layer='silu',
attn_layer='eca',
self_attn_layer='bottleneck', self_attn_layer='bottleneck',
self_attn_kwargs=dict() self_attn_kwargs=dict()
), ),
eca_botnext26ts=ByoModelCfg( eca_botnext50ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=16, br=0.25), ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=16, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=16, br=0.25), ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=16, br=0.25), ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=16, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
fixed_input_size=True, fixed_input_size=True,
act_layer='silu', act_layer='silu',
attn_layer='eca', attn_layer='eca',
@ -117,193 +135,83 @@ model_cfgs = dict(
stem_chs=64, stem_chs=64,
stem_type='7x7', stem_type='7x7',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
self_attn_layer='halo', self_attn_layer='halo',
self_attn_kwargs=dict(block_size=8, halo_size=3), self_attn_kwargs=dict(block_size=8, halo_size=3),
), ),
halonet_h1_c4c5=ByoModelCfg( halonet26t=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=0, br=1.0), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=128, s=2, gs=0, br=1.0), ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
self_attn_layer='halo', self_attn_layer='halo',
self_attn_kwargs=dict(block_size=8, halo_size=3), self_attn_kwargs=dict(block_size=8, halo_size=2, dim_head=16)
), ),
halonet26t=ByoModelCfg( sehalonet33ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ByoBlockCfg('self_attn', d=2, c=1536, s=2, gs=0, br=0.333),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='',
num_features=0, act_layer='silu',
num_features=1280,
attn_layer='se',
self_attn_layer='halo', self_attn_layer='halo',
self_attn_kwargs=dict(block_size=8, halo_size=2) # intended for 256x256 res self_attn_kwargs=dict(block_size=8, halo_size=3)
), ),
halonet50ts=ByoModelCfg( halonet50ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25), interleave_blocks(
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=6, c=1024, s=2, gs=0, br=0.25), types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25,
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25), self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=3, num_heads=4)),
interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
act_layer='silu', act_layer='silu',
self_attn_layer='halo', self_attn_layer='halo',
self_attn_kwargs=dict(block_size=8, halo_size=2) self_attn_kwargs=dict(block_size=8, halo_size=3)
), ),
eca_halonext26ts=ByoModelCfg( eca_halonext26ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
act_layer='silu', act_layer='silu',
attn_layer='eca', attn_layer='eca',
self_attn_layer='halo', self_attn_layer='halo',
self_attn_kwargs=dict(block_size=8, halo_size=2) # intended for 256x256 res self_attn_kwargs=dict(block_size=8, halo_size=2, dim_head=16)
), ),
lambda_resnet26t=ByoModelCfg( lambda_resnet26t=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25), interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
num_features=0,
self_attn_layer='lambda',
self_attn_kwargs=dict()
),
lambda_resnet50t=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=3, d=6, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
self_attn_layer='lambda',
self_attn_kwargs=dict()
),
eca_lambda_resnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
act_layer='silu',
attn_layer='eca',
self_attn_layer='lambda', self_attn_layer='lambda',
self_attn_kwargs=dict() self_attn_kwargs=dict(r=9)
),
swinnet26t=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
fixed_input_size=True,
self_attn_layer='swin',
self_attn_kwargs=dict(win_size=8)
),
swinnet50ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=4, c=512, s=2, gs=0, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
fixed_input_size=True,
act_layer='silu',
self_attn_layer='swin',
self_attn_kwargs=dict(win_size=8)
),
eca_swinnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=512, s=2, gs=16, br=0.25),
interleave_blocks(types=('bottle', 'self_attn'), every=1, d=2, c=1024, s=2, gs=16, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
fixed_input_size=True,
act_layer='silu',
attn_layer='eca',
self_attn_layer='swin',
self_attn_kwargs=dict(win_size=8)
),
rednet26t=ByoModelCfg(
blocks=(
ByoBlockCfg(type='self_attn', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered', # FIXME RedNet uses involution in middle of stem
stem_pool='maxpool',
num_features=0,
self_attn_layer='involution',
self_attn_kwargs=dict()
),
rednet50ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='self_attn', d=3, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=4, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=2, c=1024, s=2, gs=0, br=0.25),
ByoBlockCfg(type='self_attn', d=3, c=2048, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
num_features=0,
act_layer='silu',
self_attn_layer='involution',
self_attn_kwargs=dict()
), ),
) )
@@ -319,119 +227,76 @@ def _create_byoanet(variant, cfg_variant=None, pretrained=False, **kwargs):
@register_model
def botnet26t_256(pretrained=False, **kwargs):
""" Bottleneck Transformer w/ ResNet26-T backbone. Bottleneck attn in final two stages.
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('botnet26t_256', 'botnet26t', pretrained=pretrained, **kwargs)
@register_model
def botnet50t_256(pretrained=False, **kwargs):
""" Bottleneck Transformer w/ ResNet50-T backbone. Bottleneck attn in final two stages.
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('botnet50t_256', 'botnet50t', pretrained=pretrained, **kwargs)
@register_model
def eca_botnext26ts_256(pretrained=False, **kwargs):
""" Bottleneck Transformer w/ ResNet26-T backbone, silu act, Bottleneck attn in final two stages.
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('eca_botnext26ts_256', 'eca_botnext26ts', pretrained=pretrained, **kwargs)
@register_model
def eca_botnext50ts_256(pretrained=False, **kwargs):
""" Bottleneck Transformer w/ ResNet26-T backbone, silu act, Bottleneck attn in final two stages.
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('eca_botnext50ts_256', 'eca_botnext50ts', pretrained=pretrained, **kwargs)
@register_model
def halonet_h1(pretrained=False, **kwargs):
""" HaloNet-H1. Halo attention in all stages as per the paper.
NOTE: This runs very slowly!
"""
return _create_byoanet('halonet_h1', pretrained=pretrained, **kwargs)
@register_model
def halonet26t(pretrained=False, **kwargs):
""" HaloNet w/ a ResNet26-t backbone. Halo attention in final two stages.
"""
return _create_byoanet('halonet26t', pretrained=pretrained, **kwargs)
@register_model
def sehalonet33ts(pretrained=False, **kwargs):
""" HaloNet w/ a ResNet33-t backbone, SE attn for non Halo blocks, SiLU, 1-2 Halo in stage 2,3,4.
"""
return _create_byoanet('sehalonet33ts', pretrained=pretrained, **kwargs)
@register_model
def halonet50ts(pretrained=False, **kwargs):
""" HaloNet w/ a ResNet50-t backbone, silu act. Halo attention in final two stages.
"""
return _create_byoanet('halonet50ts', pretrained=pretrained, **kwargs)
@register_model
def eca_halonext26ts(pretrained=False, **kwargs):
""" HaloNet w/ a ResNet26-t backbone, silu act. Halo attention in final two stages.
"""
return _create_byoanet('eca_halonext26ts', pretrained=pretrained, **kwargs)
@register_model
def lambda_resnet26t(pretrained=False, **kwargs):
""" Lambda-ResNet-26T. Lambda layers in last two stages.
"""
return _create_byoanet('lambda_resnet26t', pretrained=pretrained, **kwargs)
@register_model
def lambda_resnet50t(pretrained=False, **kwargs):
""" Lambda-ResNet-50T. Lambda layers in one C4 stage and all C5.
"""
return _create_byoanet('lambda_resnet50t', pretrained=pretrained, **kwargs)
@register_model
def eca_lambda_resnext26ts(pretrained=False, **kwargs):
""" Lambda-ResNet-26T. Lambda layers in one C4 stage and all C5.
"""
return _create_byoanet('eca_lambda_resnext26ts', pretrained=pretrained, **kwargs)
@register_model
def swinnet26t_256(pretrained=False, **kwargs):
"""
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('swinnet26t_256', 'swinnet26t', pretrained=pretrained, **kwargs)
@register_model
def swinnet50ts_256(pretrained=False, **kwargs):
"""
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('swinnet50ts_256', 'swinnet50ts', pretrained=pretrained, **kwargs)
@register_model
def eca_swinnext26ts_256(pretrained=False, **kwargs):
"""
"""
kwargs.setdefault('img_size', 256)
return _create_byoanet('eca_swinnext26ts_256', 'eca_swinnext26ts', pretrained=pretrained, **kwargs)
@register_model
def rednet26t(pretrained=False, **kwargs):
"""
"""
return _create_byoanet('rednet26t', pretrained=pretrained, **kwargs)
@register_model
def rednet50ts(pretrained=False, **kwargs):
"""
"""
return _create_byoanet('rednet50ts', pretrained=pretrained, **kwargs)

@@ -33,7 +33,7 @@ import torch
import torch.nn as nn
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import build_model_with_cfg, named_apply
from .layers import ClassifierHead, ConvBnAct, BatchNormAct2d, DropPath, AvgPool2dSame, \
create_conv2d, get_act_layer, convert_norm_act, get_attn, make_divisible, to_2tuple
from .registry import register_model
@@ -93,19 +93,50 @@ default_cfgs = {
first_conv='stem.conv1', input_size=(3, 256, 256), pool_size=(8, 8),
test_input_size=(3, 288, 288), crop_pct=1.0),
'resnet61q': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet61q_ra2-6afc536c.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8),
test_input_size=(3, 288, 288), crop_pct=1.0, interpolation='bicubic'),
'resnext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnext26ts_256-df727fca.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'gcresnext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext26ts_256-e414378b.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'seresnext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnext26ts_256-6f0d74a3.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'eca_resnext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnext26ts_256-5a1d030f.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'bat_resnext26ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/bat_resnext26ts_256-fa6fd595.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic',
min_input_size=(3, 256, 256)),
'resnet32ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet32ts_256-aacf5250.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'resnet33ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet33ts_256-0e0cd345.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'gcresnet33ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet33ts_256-0e0cd345.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'seresnet33ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnet33ts_256-f8ad44d9.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'eca_resnet33ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnet33ts_256-8f98face.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'gcresnet50t': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet50t_256-96374d1c.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
'gcresnext50ts': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext50ts_256-3e0f515e.pth',
first_conv='stem.conv1.conv', input_size=(3, 256, 256), pool_size=(8, 8), interpolation='bicubic'),
}
@@ -135,7 +166,7 @@ class ByoModelCfg:
stem_chs: int = 32
width_factor: float = 1.0
num_features: int = 0  # num out_channels for final conv, no final 1x1 conv if 0
zero_init_last: bool = True  # zero init last weight (usually bn) in residual path
fixed_input_size: bool = False  # model constrained to a fixed-input size / img_size must be provided on creation
act_layer: str = 'relu'
@@ -159,13 +190,13 @@ def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0):
def interleave_blocks(
types: Tuple[str, str], d, every: Union[int, List[int]] = 1, first: bool = False, **kwargs
) -> Tuple[ByoBlockCfg]:
""" interleave 2 block types in stack
"""
assert len(types) == 2
if isinstance(every, int):
every = list(range(0 if first else every, d, every + 1))
if not every:
every = [d - 1]
set(every)
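To make the new `every=1` default concrete, a small sketch (assuming a timm checkout that contains this change): with d=2 and the default stride, the second block type lands at index 1, i.e. the last block of the stage becomes the self-attention block.

from timm.models.byobnet import interleave_blocks

blocks = interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25)
print([b.type for b in blocks])  # ['bottle', 'self_attn'] -> attn replaces the final block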
@@ -255,7 +286,8 @@ model_cfgs = dict(
stem_chs=64,
),
# 4 x conv stem w/ 2 act, no maxpool, 2,4,6,4 repeats, group size 32 in first 3 blocks
# DW convs in last block, 2048 pre-FC, silu act
resnet51q=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
@@ -270,6 +302,8 @@ model_cfgs = dict(
act_layer='silu',
),
# 4 x conv stem w/ 4 act, no maxpool, 1,4,6,4 repeats, edge block first, group size 32 in next 2 blocks
# DW convs in last block, 4 conv for each bottle block, 2048 pre-FC, silu act
resnet61q=ByoModelCfg(
blocks=(
ByoBlockCfg(type='edge', d=1, c=256, s=1, gs=0, br=1.0, block_kwargs=dict()),
@@ -285,53 +319,91 @@ model_cfgs = dict(
block_kwargs=dict(extra_conv=True),
),
# A series of ResNeXt-26 models w/ one of none, GC, SE, ECA, BAT attn, group size 32, SiLU act,
# and a tiered stem w/ maxpool
resnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
act_layer='silu',
),
gcresnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
act_layer='silu',
attn_layer='gca',
),
seresnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
act_layer='silu',
attn_layer='se',
),
eca_resnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
act_layer='silu',
attn_layer='eca',
),
bat_resnext26ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25),
), ),
stem_chs=64, stem_chs=64,
stem_type='tiered', stem_type='tiered',
stem_pool='maxpool', stem_pool='maxpool',
act_layer='silu',
attn_layer='bat',
attn_kwargs=dict(block_size=8)
),
# ResNet-32 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, no pre-fc feat layer, tiered stem w/o maxpool
resnet32ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='',
num_features=0, num_features=0,
act_layer='silu', act_layer='silu',
attn_layer='gc',
), ),
gcresnet26ts=ByoModelCfg( # ResNet-33 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, 1280 pre-FC feat, tiered stem w/o maxpool
resnet33ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25), ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
@ -343,23 +415,79 @@ model_cfgs = dict(
stem_pool='', stem_pool='',
num_features=1280, num_features=1280,
act_layer='silu', act_layer='silu',
attn_layer='gc',
), ),
bat_resnext26ts=ByoModelCfg( # A series of ResNet-33 (2, 3, 3, 2) models w/ one of GC, SE, ECA attn, no groups, SiLU act, 1280 pre-FC feat
# and a tiered stem w/ no maxpool
gcresnet33ts=ByoModelCfg(
blocks=( blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25), ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='',
num_features=1280,
act_layer='silu',
attn_layer='gca',
),
seresnet33ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='',
num_features=1280,
act_layer='silu',
attn_layer='se',
),
eca_resnet33ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25),
ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='',
num_features=1280,
act_layer='silu',
attn_layer='eca',
),
gcresnet50t=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, br=0.25),
ByoBlockCfg(type='bottle', d=6, c=1024, s=2, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=2048, s=2, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='',
attn_layer='gca',
),
gcresnext50ts=ByoModelCfg(
blocks=(
ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=6, c=1024, s=2, gs=32, br=0.25),
ByoBlockCfg(type='bottle', d=3, c=2048, s=2, gs=32, br=0.25),
),
stem_chs=64,
stem_type='tiered',
stem_pool='maxpool',
# stem_pool=None,
act_layer='silu',
attn_layer='gca',
),
)
@@ -467,31 +595,31 @@ def resnet61q(pretrained=False, **kwargs):
@register_model
def resnext26ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('resnext26ts', pretrained=pretrained, **kwargs)
@register_model
def gcresnext26ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('gcresnext26ts', pretrained=pretrained, **kwargs)
@register_model
def seresnext26ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('seresnext26ts', pretrained=pretrained, **kwargs)
@register_model
def eca_resnext26ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('eca_resnext26ts', pretrained=pretrained, **kwargs)
@register_model
@@ -501,6 +629,55 @@ def bat_resnext26ts(pretrained=False, **kwargs):
return _create_byobnet('bat_resnext26ts', pretrained=pretrained, **kwargs)
@register_model
def resnet32ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('resnet32ts', pretrained=pretrained, **kwargs)
@register_model
def resnet33ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('resnet33ts', pretrained=pretrained, **kwargs)
@register_model
def gcresnet33ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('gcresnet33ts', pretrained=pretrained, **kwargs)
@register_model
def seresnet33ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('seresnet33ts', pretrained=pretrained, **kwargs)
@register_model
def eca_resnet33ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('eca_resnet33ts', pretrained=pretrained, **kwargs)
@register_model
def gcresnet50t(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('gcresnet50t', pretrained=pretrained, **kwargs)
@register_model
def gcresnext50ts(pretrained=False, **kwargs):
"""
"""
return _create_byobnet('gcresnext50ts', pretrained=pretrained, **kwargs)
def expand_blocks_cfg(stage_blocks_cfg: Union[ByoBlockCfg, Sequence[ByoBlockCfg]]) -> List[ByoBlockCfg]:
if not isinstance(stage_blocks_cfg, Sequence):
stage_blocks_cfg = (stage_blocks_cfg,)
@@ -580,8 +757,8 @@ class BasicBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.act = nn.Identity() if linear_out else layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
if zero_init_last:
nn.init.zeros_(self.conv2_kxk.bn.weight)
for attn in (self.attn, self.attn_last):
if hasattr(attn, 'reset_parameters'):
@@ -637,8 +814,8 @@ class BottleneckBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.act = nn.Identity() if linear_out else layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
if zero_init_last:
nn.init.zeros_(self.conv3_1x1.bn.weight)
for attn in (self.attn, self.attn_last):
if hasattr(attn, 'reset_parameters'):
@@ -694,8 +871,8 @@ class DarkBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.act = nn.Identity() if linear_out else layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
if zero_init_last:
nn.init.zeros_(self.conv2_kxk.bn.weight)
for attn in (self.attn, self.attn_last):
if hasattr(attn, 'reset_parameters'):
@@ -747,8 +924,8 @@ class EdgeBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.act = nn.Identity() if linear_out else layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
if zero_init_last:
nn.init.zeros_(self.conv2_1x1.bn.weight)
for attn in (self.attn, self.attn_last):
if hasattr(attn, 'reset_parameters'):
@@ -790,7 +967,7 @@ class RepVggBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. and use_ident else nn.Identity()
self.act = layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
# NOTE this init overrides that base model init with specific changes for the block type
for m in self.modules():
if isinstance(m, nn.BatchNorm2d):
@@ -847,8 +1024,8 @@ class SelfAttnBlock(nn.Module):
self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
self.act = nn.Identity() if linear_out else layers.act(inplace=True)
def init_weights(self, zero_init_last: bool = False):
if zero_init_last:
nn.init.zeros_(self.conv3_1x1.bn.weight)
if hasattr(self.self_attn, 'reset_parameters'):
self.self_attn.reset_parameters()
@@ -990,27 +1167,29 @@ def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, model_cfg: ByoModelCfg):
layer_fns = block_kwargs['layers']
# override attn layer / args with block local config
attn_set = block_cfg.attn_layer is not None
if attn_set or block_cfg.attn_kwargs is not None:
# override attn layer config
if attn_set and not block_cfg.attn_layer:
# empty string for attn_layer type will disable attn for this block
attn_layer = None
else:
attn_kwargs = override_kwargs(block_cfg.attn_kwargs, model_cfg.attn_kwargs)
attn_layer = block_cfg.attn_layer or model_cfg.attn_layer
attn_layer = partial(get_attn(attn_layer), **attn_kwargs) if attn_layer is not None else None
layer_fns = replace(layer_fns, attn=attn_layer)
# override self-attn layer / args with block local cfg
self_attn_set = block_cfg.self_attn_layer is not None
if self_attn_set or block_cfg.self_attn_kwargs is not None:
# override attn layer config
if self_attn_set and not block_cfg.self_attn_layer:  # attn_layer == ''
# empty string for self_attn_layer type will disable attn for this block
self_attn_layer = None
else:
self_attn_kwargs = override_kwargs(block_cfg.self_attn_kwargs, model_cfg.self_attn_kwargs)
self_attn_layer = block_cfg.self_attn_layer or model_cfg.self_attn_layer
self_attn_layer = partial(get_attn(self_attn_layer), **self_attn_kwargs) \
if self_attn_layer is not None else None
layer_fns = replace(layer_fns, self_attn=self_attn_layer)
@@ -1099,7 +1278,7 @@ class ByobNet(nn.Module):
Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act).
"""
def __init__(self, cfg: ByoModelCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32,
zero_init_last=True, img_size=None, drop_rate=0., drop_path_rate=0.):
super().__init__()
self.num_classes = num_classes
self.drop_rate = drop_rate
@@ -1130,12 +1309,8 @@ class ByobNet(nn.Module):
self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
# init weights
named_apply(partial(_init_weights, zero_init_last=zero_init_last), self)
def get_classifier(self):
return self.head.fc
@@ -1155,20 +1330,22 @@ class ByobNet(nn.Module):
return x
def _init_weights(module, name='', zero_init_last=False):
if isinstance(module, nn.Conv2d):
fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
fan_out //= module.groups
module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Linear):
nn.init.normal_(module.weight, mean=0.0, std=0.01)
if module.bias is not None:
nn.init.zeros_(module.bias)
elif isinstance(module, nn.BatchNorm2d):
nn.init.ones_(module.weight)
nn.init.zeros_(module.bias)
elif hasattr(module, 'init_weights'):
module.init_weights(zero_init_last=zero_init_last)
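For context, a simplified sketch of what the named_apply call in ByobNet.__init__ does (the real helper lives in timm.models.helpers and also exposes include_root/depth_first flags; this is an approximation, not the exact implementation): it walks the module tree and invokes the function with module= and name= keyword arguments, which is why _init_weights now takes (module, name, zero_init_last).

import torch.nn as nn

def named_apply_sketch(fn, module: nn.Module, name: str = '') -> nn.Module:
    # recurse into children first, then apply fn to the current module (depth-first);
    # hypothetical simplified helper, not the actual timm named_apply
    for child_name, child_module in module.named_children():
        full_name = '.'.join((name, child_name)) if name else child_name
        named_apply_sketch(fn, child_module, name=full_name)
    fn(module=module, name=name)
    return module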
def _create_byobnet(variant, pretrained=False, **kwargs):

@@ -19,7 +19,6 @@ from .gather_excite import GatherExcite
from .global_context import GlobalContext
from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible
from .inplace_abn import InplaceAbn
from .involution import Involution
from .linear import Linear
from .mixed_conv2d import MixedConv2d
from .mlp import Mlp, GluMlp, GatedMlp

@@ -0,0 +1,182 @@
""" Attention Pool 2D
Implementations of 2D spatial feature pooling using multi-head attention instead of average pool.
Based on idea in CLIP by OpenAI, licensed Apache 2.0
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py
Hacked together by / Copyright 2021 Ross Wightman
"""
import math
from typing import List, Union, Tuple
import torch
import torch.nn as nn
from .helpers import to_2tuple
from .weight_init import trunc_normal_
def rot(x):
return torch.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape)
def apply_rot_embed(x: torch.Tensor, sin_emb, cos_emb):
return x * cos_emb + rot(x) * sin_emb
def apply_rot_embed_list(x: List[torch.Tensor], sin_emb, cos_emb):
if isinstance(x, torch.Tensor):
x = [x]
return [t * cos_emb + rot(t) * sin_emb for t in x]
class RotaryEmbedding(nn.Module):
""" Rotary position embedding
NOTE: This is my initial attempt at implementing rotary embedding for spatial use; it has not
been well tested and will likely change. It will be moved to its own file.
The following impl/resources were referenced for this impl:
* https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py
* https://blog.eleuther.ai/rotary-embeddings/
"""
def __init__(self, dim, max_freq=4):
super().__init__()
self.dim = dim
self.register_buffer('bands', 2 ** torch.linspace(0., max_freq - 1, self.dim // 4), persistent=False)
def get_embed(self, shape: torch.Size, device: torch.device = None, dtype: torch.dtype = None):
"""
NOTE: shape arg should include spatial dim only
"""
device = device or self.bands.device
dtype = dtype or self.bands.dtype
if not isinstance(shape, torch.Size):
shape = torch.Size(shape)
N = shape.numel()
grid = torch.stack(torch.meshgrid(
[torch.linspace(-1., 1., steps=s, device=device, dtype=dtype) for s in shape]), dim=-1).unsqueeze(-1)
emb = grid * math.pi * self.bands
sin = emb.sin().reshape(N, -1).repeat_interleave(2, -1)
cos = emb.cos().reshape(N, -1).repeat_interleave(2, -1)
return sin, cos
def forward(self, x):
# assuming channel-first tensor where spatial dim are >= 2
sin_emb, cos_emb = self.get_embed(x.shape[2:])
return apply_rot_embed(x, sin_emb, cos_emb)
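A short usage sketch (not in the diff): get_embed() builds sin/cos tables for an H*W token grid and apply_rot_embed() broadcasts them over batch and head dimensions, which is how RotAttentionPool2d below consumes them.

import torch

head_dim = 64
rope = RotaryEmbedding(head_dim)              # head_dim should be divisible by 4 (dim // 4 bands)
sin_emb, cos_emb = rope.get_embed((8, 8))     # tables for an 8x8 feature map
print(sin_emb.shape)                          # torch.Size([64, 64]) == (H*W, head_dim)

q = torch.randn(2, 4, 8 * 8, head_dim)        # (batch, heads, tokens, head_dim)
q_rot = apply_rot_embed(q, sin_emb, cos_emb)  # same shape, channel pairs rotated by grid position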
class RotAttentionPool2d(nn.Module):
""" Attention based 2D feature pooling w/ rotary (relative) pos embedding.
This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.
Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed.
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py
NOTE: While this impl does not require a fixed feature size, performance at differing resolutions from
train varies widely and falls off dramatically. I'm not sure if there is a way around this... -RW
"""
def __init__(
self,
in_features: int,
out_features: int = None,
embed_dim: int = None,
num_heads: int = 4,
qkv_bias: bool = True,
):
super().__init__()
embed_dim = embed_dim or in_features
out_features = out_features or in_features
self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
self.proj = nn.Linear(embed_dim, out_features)
self.num_heads = num_heads
assert embed_dim % num_heads == 0
self.head_dim = embed_dim // num_heads
self.scale = self.head_dim ** -0.5
self.pos_embed = RotaryEmbedding(self.head_dim)
trunc_normal_(self.qkv.weight, std=in_features ** -0.5)
nn.init.zeros_(self.qkv.bias)
def forward(self, x):
B, _, H, W = x.shape
N = H * W
sin_emb, cos_emb = self.pos_embed.get_embed(x.shape[2:])
x = x.reshape(B, -1, N).permute(0, 2, 1)
x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
q, k, v = x[0], x[1], x[2]
qc, q = q[:, :, :1], q[:, :, 1:]
q = apply_rot_embed(q, sin_emb, cos_emb)
q = torch.cat([qc, q], dim=2)
kc, k = k[:, :, :1], k[:, :, 1:]
k = apply_rot_embed(k, sin_emb, cos_emb)
k = torch.cat([kc, k], dim=2)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1)
x = self.proj(x)
return x[:, 0]
class AttentionPool2d(nn.Module):
""" Attention based 2D feature pooling w/ learned (absolute) pos embedding.
This is a multi-head attention based replacement for (spatial) average pooling in NN architectures.
It was based on impl in CLIP by OpenAI
https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py
NOTE: This requires feature size upon construction and will prevent adaptive sizing of the network.
"""
def __init__(
self,
in_features: int,
feat_size: Union[int, Tuple[int, int]],
out_features: int = None,
embed_dim: int = None,
num_heads: int = 4,
qkv_bias: bool = True,
):
super().__init__()
embed_dim = embed_dim or in_features
out_features = out_features or in_features
assert embed_dim % num_heads == 0
self.feat_size = to_2tuple(feat_size)
self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias)
self.proj = nn.Linear(embed_dim, out_features)
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
self.scale = self.head_dim ** -0.5
spatial_dim = self.feat_size[0] * self.feat_size[1]
self.pos_embed = nn.Parameter(torch.zeros(spatial_dim + 1, in_features))
trunc_normal_(self.pos_embed, std=in_features ** -0.5)
trunc_normal_(self.qkv.weight, std=in_features ** -0.5)
nn.init.zeros_(self.qkv.bias)
def forward(self, x):
B, _, H, W = x.shape
N = H * W
assert self.feat_size[0] == H
assert self.feat_size[1] == W
x = x.reshape(B, -1, N).permute(0, 2, 1)
x = torch.cat([x.mean(1, keepdim=True), x], dim=1)
x = x + self.pos_embed.unsqueeze(0).to(x.dtype)
x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
q, k, v = x[0], x[1], x[2]
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1)
x = self.proj(x)
return x[:, 0]
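A quick usage sketch (not in the commit itself) for the two new pooling modules; it assumes the file above lands at timm/models/layers/attention_pool2d.py and uses illustrative shapes:
import torch
from timm.models.layers.attention_pool2d import AttentionPool2d, RotAttentionPool2d
feats = torch.randn(2, 512, 7, 7)                          # B, C, H, W feature map
abs_pool = AttentionPool2d(512, feat_size=7, num_heads=8)  # fixed 7x7 grid, learned abs pos embed
rot_pool = RotAttentionPool2d(512, num_heads=8)            # no fixed grid, rotary pos embed
print(abs_pool(feats).shape, rot_pool(feats).shape)        # both pool spatially to (2, 512)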

@ -102,6 +102,8 @@ class BottleneckAttn(nn.Module):
self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity()
self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5) trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5)
trunc_normal_(self.pos_embed.height_rel, std=self.scale) trunc_normal_(self.pos_embed.height_rel, std=self.scale)
@ -109,7 +111,8 @@ class BottleneckAttn(nn.Module):
def forward(self, x): def forward(self, x):
B, C, H, W = x.shape B, C, H, W = x.shape
assert H == self.pos_embed.height and W == self.pos_embed.width assert H == self.pos_embed.height
assert W == self.pos_embed.width
x = self.qkv(x) # B, 3 * num_heads * dim_head, H, W x = self.qkv(x) # B, 3 * num_heads * dim_head, H, W
x = x.reshape(B, -1, self.dim_head, H * W).transpose(-1, -2) x = x.reshape(B, -1, self.dim_head, H * W).transpose(-1, -2)
@ -118,8 +121,8 @@ class BottleneckAttn(nn.Module):
attn_logits = (q @ k.transpose(-1, -2)) * self.scale attn_logits = (q @ k.transpose(-1, -2)) * self.scale
attn_logits = attn_logits + self.pos_embed(q) # B, num_heads, H * W, H * W attn_logits = attn_logits + self.pos_embed(q) # B, num_heads, H * W, H * W
attn_out = attn_logits.softmax(dim = -1) attn_out = attn_logits.softmax(dim=-1)
attn_out = (attn_out @ v).transpose(1, 2).reshape(B, self.dim_out, H, W) # B, dim_out, H, W attn_out = (attn_out @ v).transpose(1, 2).reshape(B, self.dim_out, H, W) # B, dim_out, H, W
attn_out = self.pool(attn_out) attn_out = self.pool(attn_out)
return attn_out return attn_out

@ -11,13 +11,11 @@ from .eca import EcaModule, CecaModule
from .gather_excite import GatherExcite from .gather_excite import GatherExcite
from .global_context import GlobalContext from .global_context import GlobalContext
from .halo_attn import HaloAttn from .halo_attn import HaloAttn
from .involution import Involution
from .lambda_layer import LambdaLayer from .lambda_layer import LambdaLayer
from .non_local_attn import NonLocalAttn, BatNonLocalAttn from .non_local_attn import NonLocalAttn, BatNonLocalAttn
from .selective_kernel import SelectiveKernel from .selective_kernel import SelectiveKernel
from .split_attn import SplitAttn from .split_attn import SplitAttn
from .squeeze_excite import SEModule, EffectiveSEModule from .squeeze_excite import SEModule, EffectiveSEModule
from .swin_attn import WindowAttention
def get_attn(attn_type): def get_attn(attn_type):
@ -43,6 +41,8 @@ def get_attn(attn_type):
module_cls = GatherExcite module_cls = GatherExcite
elif attn_type == 'gc': elif attn_type == 'gc':
module_cls = GlobalContext module_cls = GlobalContext
elif attn_type == 'gca':
module_cls = partial(GlobalContext, fuse_add=True, fuse_scale=False)
elif attn_type == 'cbam': elif attn_type == 'cbam':
module_cls = CbamModule module_cls = CbamModule
elif attn_type == 'lcbam': elif attn_type == 'lcbam':
@ -65,10 +65,6 @@ def get_attn(attn_type):
return BottleneckAttn return BottleneckAttn
elif attn_type == 'halo': elif attn_type == 'halo':
return HaloAttn return HaloAttn
elif attn_type == 'swin':
return WindowAttention
elif attn_type == 'involution':
return Involution
elif attn_type == 'nl': elif attn_type == 'nl':
module_cls = NonLocalAttn module_cls = NonLocalAttn
elif attn_type == 'bat': elif attn_type == 'bat':
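A hedged sketch (not in the commit) of the new 'gca' attn type; the get_attn export and the GlobalContext(channels) call signature are assumed from timm.models.layers:
import torch
from timm.models.layers import get_attn
gca = get_attn('gca')(64)             # GlobalContext w/ fuse_add=True, fuse_scale=False for 64 channels
out = gca(torch.randn(1, 64, 8, 8))   # keeps the input B, C, H, W shape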

@ -12,10 +12,7 @@ Year = {2021},
Status: Status:
This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me. This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me.
The attention mechanism works but it's slow as implemented.
Trying to match the 'H1' variant in the paper, my parameter counts are 2M less and the model
is extremely slow. Something isn't right. However, the models do appear to train and experimental
variants with attn in C4 and/or C5 stages are tolerable speed.
Hacked together by / Copyright 2021 Ross Wightman Hacked together by / Copyright 2021 Ross Wightman
""" """
@ -103,14 +100,14 @@ class HaloAttn(nn.Module):
- https://arxiv.org/abs/2103.12731 - https://arxiv.org/abs/2103.12731
""" """
def __init__( def __init__(
self, dim, dim_out=None, stride=1, num_heads=8, dim_head=16, block_size=8, halo_size=3, qkv_bias=False): self, dim, dim_out=None, stride=1, num_heads=8, dim_head=None, block_size=8, halo_size=3, qkv_bias=False):
super().__init__() super().__init__()
dim_out = dim_out or dim dim_out = dim_out or dim
assert dim_out % num_heads == 0 assert dim_out % num_heads == 0
self.stride = stride self.stride = stride
self.num_heads = num_heads self.num_heads = num_heads
self.dim_head = dim_head self.dim_head = dim_head or dim // num_heads
self.dim_qk = num_heads * dim_head self.dim_qk = num_heads * self.dim_head
self.dim_v = dim_out self.dim_v = dim_out
self.block_size = block_size self.block_size = block_size
self.halo_size = halo_size self.halo_size = halo_size
@ -126,6 +123,8 @@ class HaloAttn(nn.Module):
self.pos_embed = PosEmbedRel( self.pos_embed = PosEmbedRel(
block_size=block_size // self.stride, win_size=self.win_size, dim_head=self.dim_head, scale=self.scale) block_size=block_size // self.stride, win_size=self.win_size, dim_head=self.dim_head, scale=self.scale)
self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
std = self.q.weight.shape[1] ** -0.5 # fan-in std = self.q.weight.shape[1] ** -0.5 # fan-in
trunc_normal_(self.q.weight, std=std) trunc_normal_(self.q.weight, std=std)
@ -135,32 +134,46 @@ class HaloAttn(nn.Module):
def forward(self, x): def forward(self, x):
B, C, H, W = x.shape B, C, H, W = x.shape
assert H % self.block_size == 0 and W % self.block_size == 0 assert H % self.block_size == 0
assert W % self.block_size == 0
num_h_blocks = H // self.block_size num_h_blocks = H // self.block_size
num_w_blocks = W // self.block_size num_w_blocks = W // self.block_size
num_blocks = num_h_blocks * num_w_blocks num_blocks = num_h_blocks * num_w_blocks
bs_stride = self.block_size // self.stride
q = self.q(x) q = self.q(x)
q = F.unfold(q, kernel_size=self.block_size // self.stride, stride=self.block_size // self.stride) # unfold
q = q.reshape(-1, self.dim_head, num_h_blocks, bs_stride, num_w_blocks, bs_stride).permute(0, 1, 3, 5, 2, 4)
# B, num_heads * dim_head * block_size ** 2, num_blocks # B, num_heads * dim_head * block_size ** 2, num_blocks
q = q.reshape(B * self.num_heads, self.dim_head, -1, num_blocks).transpose(1, 3) q = q.reshape(B * self.num_heads, self.dim_head, -1, num_blocks).transpose(1, 3)
# B * num_heads, num_blocks, block_size ** 2, dim_head # B * num_heads, num_blocks, block_size ** 2, dim_head
kv = self.kv(x) kv = self.kv(x)
# FIXME I 'think' this unfold does what I want it to, but I should investigate # generate overlapping windows for kv
kv = F.unfold(kv, kernel_size=self.win_size, stride=self.block_size, padding=self.halo_size) kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size])
kv = kv.reshape( kv = kv.unfold(2, self.win_size, self.block_size).unfold(3, self.win_size, self.block_size).reshape(
B * self.num_heads, self.dim_head + (self.dim_v // self.num_heads), -1, num_blocks).transpose(1, 3) B * self.num_heads, self.dim_head + (self.dim_v // self.num_heads), num_blocks, -1).permute(0, 2, 3, 1)
# NOTE these two alternatives are equivalent, but above is the best balance of performance and clarity
# if self.stride_tricks:
# kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size]).contiguous()
# kv = kv.as_strided((
# B, self.dim_qk + self.dim_v, self.win_size, self.win_size, num_h_blocks, num_w_blocks),
# stride=(kv.stride(0), kv.stride(1), kv.shape[-1], 1, self.block_size * kv.shape[-1], self.block_size))
# else:
# kv = F.unfold(kv, kernel_size=self.win_size, stride=self.block_size, padding=self.halo_size)
# kv = kv.reshape(
# B * self.num_heads, self.dim_head + (self.dim_v // self.num_heads), -1, num_blocks).transpose(1, 3)
k, v = torch.split(kv, [self.dim_head, self.dim_v // self.num_heads], dim=-1) k, v = torch.split(kv, [self.dim_head, self.dim_v // self.num_heads], dim=-1)
# B * num_heads, num_blocks, block_size ** 2, dim_head or dim_v // num_heads
attn_logits = (q @ k.transpose(-1, -2)) * self.scale # FIXME should usual attn scale be applied? attn_logits = (q @ k.transpose(-1, -2)) * self.scale # FIXME should usual attn scale be applied?
attn_logits = attn_logits + self.pos_embed(q) # B * num_heads, block_size ** 2, win_size ** 2 attn_logits = attn_logits + self.pos_embed(q) # B * num_heads, block_size ** 2, win_size ** 2
attn_out = attn_logits.softmax(dim=-1) attn_out = attn_logits.softmax(dim=-1)
attn_out = (attn_out @ v).transpose(1, 3) # B * num_heads, dim_v // num_heads, block_size ** 2, num_blocks attn_out = (attn_out @ v).transpose(1, 3) # B * num_heads, dim_v // num_heads, block_size ** 2, num_blocks
attn_out = F.fold(
attn_out.reshape(B, -1, num_blocks), # fold
(H // self.stride, W // self.stride), attn_out = attn_out.reshape(-1, bs_stride, bs_stride, num_h_blocks, num_w_blocks)
kernel_size=self.block_size // self.stride, stride=self.block_size // self.stride) attn_out = attn_out.permute(0, 3, 1, 4, 2).contiguous().view(B, self.dim_v, H // self.stride, W // self.stride)
# B, dim_out, H // stride, W // stride # B, dim_out, H // stride, W // stride
return attn_out return attn_out
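A small self-check (not in the commit) of the pad + Tensor.unfold windowing that replaces F.unfold above, on toy sizes; it verifies the overlapping kv window extraction only, not HaloAttn's exact head reshape:
import torch
import torch.nn.functional as F
B, C, H, W = 1, 4, 16, 16
block_size, halo_size = 8, 3
win_size = block_size + 2 * halo_size
kv = torch.randn(B, C, H, W)
# original path: im2col over overlapping windows
ref = F.unfold(kv, kernel_size=win_size, stride=block_size, padding=halo_size)
ref = ref.reshape(B, C, win_size * win_size, -1)         # B, C, win^2, num_blocks
# new path: explicit pad, then strided unfold over H and W
alt = F.pad(kv, [halo_size] * 4)
alt = alt.unfold(2, win_size, block_size).unfold(3, win_size, block_size)  # B, C, nH, nW, win, win
alt = alt.permute(0, 1, 4, 5, 2, 3).reshape(B, C, win_size * win_size, -1)
print(torch.allclose(ref, alt))  # True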

@ -1,50 +0,0 @@
""" PyTorch Involution Layer
Official impl: https://github.com/d-li14/involution/blob/main/cls/mmcls/models/utils/involution_naive.py
Paper: `Involution: Inverting the Inherence of Convolution for Visual Recognition` - https://arxiv.org/abs/2103.06255
"""
import torch.nn as nn
from .conv_bn_act import ConvBnAct
from .create_conv2d import create_conv2d
class Involution(nn.Module):
def __init__(
self,
channels,
kernel_size=3,
stride=1,
group_size=16,
rd_ratio=4,
norm_layer=nn.BatchNorm2d,
act_layer=nn.ReLU,
):
super(Involution, self).__init__()
self.kernel_size = kernel_size
self.stride = stride
self.channels = channels
self.group_size = group_size
self.groups = self.channels // self.group_size
self.conv1 = ConvBnAct(
in_channels=channels,
out_channels=channels // rd_ratio,
kernel_size=1,
norm_layer=norm_layer,
act_layer=act_layer)
self.conv2 = self.conv = create_conv2d(
in_channels=channels // rd_ratio,
out_channels=kernel_size**2 * self.groups,
kernel_size=1,
stride=1)
self.avgpool = nn.AvgPool2d(stride, stride) if stride == 2 else nn.Identity()
self.unfold = nn.Unfold(kernel_size, 1, (kernel_size-1)//2, stride)
def forward(self, x):
weight = self.conv2(self.conv1(self.avgpool(x)))
B, C, H, W = weight.shape
KK = int(self.kernel_size ** 2)
weight = weight.view(B, self.groups, KK, H, W).unsqueeze(2)
out = self.unfold(x).view(B, self.groups, self.group_size, KK, H, W)
out = (weight * out).sum(dim=3).view(B, self.channels, H, W)
return out

@ -57,6 +57,8 @@ class LambdaLayer(nn.Module):
self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity()
self.reset_parameters()
def reset_parameters(self): def reset_parameters(self):
trunc_normal_(self.qkv.weight, std=self.dim ** -0.5) trunc_normal_(self.qkv.weight, std=self.dim ** -0.5)
trunc_normal_(self.conv_lambda.weight, std=self.dim_k ** -0.5) trunc_normal_(self.conv_lambda.weight, std=self.dim_k ** -0.5)

@ -1,182 +0,0 @@
""" Shifted Window Attn
This is a WIP experiment to apply windowed attention from the Swin Transformer
to a stand-alone module for use as an attn block in conv nets.
Based on original swin window code at https://github.com/microsoft/Swin-Transformer
Swin Transformer paper: https://arxiv.org/pdf/2103.14030.pdf
"""
from typing import Optional
import torch
import torch.nn as nn
from .drop import DropPath
from .helpers import to_2tuple
from .weight_init import trunc_normal_
def window_partition(x, win_size: int):
"""
Args:
x: (B, H, W, C)
win_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.view(B, H // win_size, win_size, W // win_size, win_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, win_size, win_size, C)
return windows
def window_reverse(windows, win_size: int, H: int, W: int):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
win_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
B = int(windows.shape[0] / (H * W / win_size / win_size))
x = windows.view(B, H // win_size, W // win_size, win_size, win_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
class WindowAttention(nn.Module):
r""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.
Args:
dim (int): Number of input channels.
win_size (int): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
"""
def __init__(
self, dim, dim_out=None, feat_size=None, stride=1, win_size=8, shift_size=None, num_heads=8,
qkv_bias=True, attn_drop=0.):
super().__init__()
self.dim_out = dim_out or dim
self.feat_size = to_2tuple(feat_size)
self.win_size = win_size
self.shift_size = shift_size or win_size // 2
if min(self.feat_size) <= win_size:
# if window size is larger than input resolution, we don't partition windows
self.shift_size = 0
self.win_size = min(self.feat_size)
assert 0 <= self.shift_size < self.win_size, "shift_size must in 0-window_size"
self.num_heads = num_heads
head_dim = self.dim_out // num_heads
self.scale = head_dim ** -0.5
if self.shift_size > 0:
# calculate attention mask for SW-MSA
H, W = self.feat_size
img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
h_slices = (
slice(0, -self.win_size),
slice(-self.win_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (
slice(0, -self.win_size),
slice(-self.win_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(img_mask, self.win_size) # num_win, window_size, window_size, 1
mask_windows = mask_windows.view(-1, self.win_size * self.win_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
attn_mask = None
self.register_buffer("attn_mask", attn_mask)
# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
# 2 * Wh - 1 * 2 * Ww - 1, nH
torch.zeros((2 * self.win_size - 1) * (2 * self.win_size - 1), num_heads))
trunc_normal_(self.relative_position_bias_table, std=.02)
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.win_size)
coords_w = torch.arange(self.win_size)
coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.win_size - 1 # shift to start from 0
relative_coords[:, :, 1] += self.win_size - 1
relative_coords[:, :, 0] *= 2 * self.win_size - 1
relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.register_buffer("relative_position_index", relative_position_index)
self.qkv = nn.Linear(dim, self.dim_out * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.softmax = nn.Softmax(dim=-1)
self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity()
def reset_parameters(self):
trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5)
trunc_normal_(self.relative_position_bias_table, std=.02)
def forward(self, x):
B, C, H, W = x.shape
x = x.permute(0, 2, 3, 1)
# cyclic shift
if self.shift_size > 0:
shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_x = x
# partition windows
win_size_sq = self.win_size * self.win_size
x_windows = window_partition(shifted_x, self.win_size) # num_win * B, window_size, window_size, C
x_windows = x_windows.view(-1, win_size_sq, C) # num_win * B, window_size*window_size, C
BW, N, _ = x_windows.shape
qkv = self.qkv(x_windows)
qkv = qkv.reshape(BW, N, 3, self.num_heads, self.dim_out // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)].view(win_size_sq, win_size_sq, -1)
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh * Ww, Wh * Ww
attn = attn + relative_position_bias.unsqueeze(0)
if self.attn_mask is not None:
num_win = self.attn_mask.shape[0]
attn = attn.view(B, num_win, self.num_heads, N, N) + self.attn_mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(BW, N, self.dim_out)
# merge windows
x = x.view(-1, self.win_size, self.win_size, self.dim_out)
shifted_x = window_reverse(x, self.win_size, H, W) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
x = shifted_x
x = x.view(B, H, W, self.dim_out).permute(0, 3, 1, 2)
x = self.pool(x)
return x

@ -50,7 +50,7 @@ default_cfgs = {
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26d-69e92c46.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26d-69e92c46.pth',
interpolation='bicubic', first_conv='conv1.0'), interpolation='bicubic', first_conv='conv1.0'),
'resnet26t': _cfg( 'resnet26t': _cfg(
url='', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet26t_256_ra2-6f6fa748.pth',
interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8)), interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8)),
'resnet50': _cfg( 'resnet50': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50_ram-a26f946b.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50_ram-a26f946b.pth',

@ -683,7 +683,8 @@ def vit_large_patch16_384(pretrained=False, **kwargs):
def vit_base_patch16_sam_224(pretrained=False, **kwargs): def vit_base_patch16_sam_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/16) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548 """ ViT-Base (ViT-B/16) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548
""" """
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) # NOTE original SAM weights release worked with representation_size=768
model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, representation_size=0, **kwargs)
model = _create_vision_transformer('vit_base_patch16_sam_224', pretrained=pretrained, **model_kwargs) model = _create_vision_transformer('vit_base_patch16_sam_224', pretrained=pretrained, **model_kwargs)
return model return model
@ -692,7 +693,8 @@ def vit_base_patch16_sam_224(pretrained=False, **kwargs):
def vit_base_patch32_sam_224(pretrained=False, **kwargs): def vit_base_patch32_sam_224(pretrained=False, **kwargs):
""" ViT-Base (ViT-B/32) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548 """ ViT-Base (ViT-B/32) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548
""" """
model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=768, **kwargs) # NOTE original SAM weights release worked with representation_size=768
model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, representation_size=0, **kwargs)
model = _create_vision_transformer('vit_base_patch32_sam_224', pretrained=pretrained, **model_kwargs) model = _create_vision_transformer('vit_base_patch32_sam_224', pretrained=pretrained, **model_kwargs)
return model return model
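A hedged sketch (not in the commit); the pre_logits attribute is assumed from timm's VisionTransformer, where a falsy representation_size maps it to nn.Identity:
import timm
model = timm.create_model('vit_base_patch16_sam_224', pretrained=False)
print(type(model.pre_logits))  # expected nn.Identity now that no pre-logits layer is built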

@ -1,6 +1,9 @@
from .cosine_lr import CosineLRScheduler from .cosine_lr import CosineLRScheduler
from .multistep_lr import MultiStepLRScheduler
from .plateau_lr import PlateauLRScheduler from .plateau_lr import PlateauLRScheduler
from .poly_lr import PolyLRScheduler
from .step_lr import StepLRScheduler from .step_lr import StepLRScheduler
from .tanh_lr import TanhLRScheduler from .tanh_lr import TanhLRScheduler
from .scheduler_factory import create_scheduler from .scheduler_factory import create_scheduler
from .scheduler import Scheduler from .scheduler import Scheduler

@ -1,8 +1,8 @@
""" Cosine Scheduler """ Cosine Scheduler
Cosine LR schedule with warmup, cycle/restarts, noise. Cosine LR schedule with warmup, cycle/restarts, noise, k-decay.
Hacked together by / Copyright 2020 Ross Wightman Hacked together by / Copyright 2021 Ross Wightman
""" """
import logging import logging
import math import math
@ -22,23 +22,26 @@ class CosineLRScheduler(Scheduler):
Inspiration from Inspiration from
https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers/cosine.py https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers/cosine.py
k-decay option based on `k-decay: A New Method For Learning Rate Schedule` - https://arxiv.org/abs/2004.05909
""" """
def __init__(self, def __init__(self,
optimizer: torch.optim.Optimizer, optimizer: torch.optim.Optimizer,
t_initial: int, t_initial: int,
t_mul: float = 1.,
lr_min: float = 0., lr_min: float = 0.,
decay_rate: float = 1., cycle_mul: float = 1.,
cycle_decay: float = 1.,
cycle_limit: int = 1,
warmup_t=0, warmup_t=0,
warmup_lr_init=0, warmup_lr_init=0,
warmup_prefix=False, warmup_prefix=False,
cycle_limit=0,
t_in_epochs=True, t_in_epochs=True,
noise_range_t=None, noise_range_t=None,
noise_pct=0.67, noise_pct=0.67,
noise_std=1.0, noise_std=1.0,
noise_seed=42, noise_seed=42,
k_decay=1.0,
initialize=True) -> None: initialize=True) -> None:
super().__init__( super().__init__(
optimizer, param_group_field="lr", optimizer, param_group_field="lr",
@ -47,18 +50,19 @@ class CosineLRScheduler(Scheduler):
assert t_initial > 0 assert t_initial > 0
assert lr_min >= 0 assert lr_min >= 0
if t_initial == 1 and t_mul == 1 and decay_rate == 1: if t_initial == 1 and cycle_mul == 1 and cycle_decay == 1:
_logger.warning("Cosine annealing scheduler will have no effect on the learning " _logger.warning("Cosine annealing scheduler will have no effect on the learning "
"rate since t_initial = t_mul = eta_mul = 1.") "rate since t_initial = t_mul = eta_mul = 1.")
self.t_initial = t_initial self.t_initial = t_initial
self.t_mul = t_mul
self.lr_min = lr_min self.lr_min = lr_min
self.decay_rate = decay_rate self.cycle_mul = cycle_mul
self.cycle_decay = cycle_decay
self.cycle_limit = cycle_limit self.cycle_limit = cycle_limit
self.warmup_t = warmup_t self.warmup_t = warmup_t
self.warmup_lr_init = warmup_lr_init self.warmup_lr_init = warmup_lr_init
self.warmup_prefix = warmup_prefix self.warmup_prefix = warmup_prefix
self.t_in_epochs = t_in_epochs self.t_in_epochs = t_in_epochs
self.k_decay = k_decay
if self.warmup_t: if self.warmup_t:
self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
super().update_groups(self.warmup_lr_init) super().update_groups(self.warmup_lr_init)
@ -72,22 +76,23 @@ class CosineLRScheduler(Scheduler):
if self.warmup_prefix: if self.warmup_prefix:
t = t - self.warmup_t t = t - self.warmup_t
if self.t_mul != 1: if self.cycle_mul != 1:
i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul))
t_i = self.t_mul ** i * self.t_initial t_i = self.cycle_mul ** i * self.t_initial
t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial
else: else:
i = t // self.t_initial i = t // self.t_initial
t_i = self.t_initial t_i = self.t_initial
t_curr = t - (self.t_initial * i) t_curr = t - (self.t_initial * i)
gamma = self.decay_rate ** i gamma = self.cycle_decay ** i
lr_min = self.lr_min * gamma
lr_max_values = [v * gamma for v in self.base_values] lr_max_values = [v * gamma for v in self.base_values]
k = self.k_decay
if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): if i < self.cycle_limit:
lrs = [ lrs = [
lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * t_curr / t_i)) for lr_max in lr_max_values self.lr_min + 0.5 * (lr_max - self.lr_min) * (1 + math.cos(math.pi * t_curr ** k / t_i ** k))
for lr_max in lr_max_values
] ]
else: else:
lrs = [self.lr_min for _ in self.base_values] lrs = [self.lr_min for _ in self.base_values]
@ -107,10 +112,8 @@ class CosineLRScheduler(Scheduler):
return None return None
def get_cycle_length(self, cycles=0): def get_cycle_length(self, cycles=0):
if not cycles: cycles = max(1, cycles or self.cycle_limit)
cycles = self.cycle_limit if self.cycle_mul == 1.0:
cycles = max(1, cycles)
if self.t_mul == 1.0:
return self.t_initial * cycles return self.t_initial * cycles
else: else:
return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul)))
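A hedged construction sketch (not in the commit) showing the renamed cycle_* args and k_decay; Scheduler.step(epoch) is assumed from timm.scheduler.scheduler:
import torch
from timm.scheduler import CosineLRScheduler
opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = CosineLRScheduler(
    opt, t_initial=100, lr_min=1e-5,
    cycle_mul=1., cycle_decay=0.5, cycle_limit=3,   # formerly t_mul / decay_rate; cycle_limit default is now 1
    warmup_t=5, warmup_lr_init=1e-4, k_decay=1.0)
for epoch in range(sched.get_cycle_length()):       # 3 cycles x 100 epochs
    sched.step(epoch)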

@ -0,0 +1,116 @@
""" Polynomial Scheduler
Polynomial LR schedule with warmup, noise.
Hacked together by / Copyright 2021 Ross Wightman
"""
import math
import logging
import torch
from .scheduler import Scheduler
_logger = logging.getLogger(__name__)
class PolyLRScheduler(Scheduler):
""" Polynomial LR Scheduler w/ warmup, noise, and k-decay
k-decay option based on `k-decay: A New Method For Learning Rate Schedule` - https://arxiv.org/abs/2004.05909
"""
def __init__(self,
optimizer: torch.optim.Optimizer,
t_initial: int,
power: float = 0.5,
lr_min: float = 0.,
cycle_mul: float = 1.,
cycle_decay: float = 1.,
cycle_limit: int = 1,
warmup_t=0,
warmup_lr_init=0,
warmup_prefix=False,
t_in_epochs=True,
noise_range_t=None,
noise_pct=0.67,
noise_std=1.0,
noise_seed=42,
k_decay=.5,
initialize=True) -> None:
super().__init__(
optimizer, param_group_field="lr",
noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed,
initialize=initialize)
assert t_initial > 0
assert lr_min >= 0
if t_initial == 1 and cycle_mul == 1 and cycle_decay == 1:
_logger.warning("Cosine annealing scheduler will have no effect on the learning "
"rate since t_initial = t_mul = eta_mul = 1.")
self.t_initial = t_initial
self.power = power
self.lr_min = lr_min
self.cycle_mul = cycle_mul
self.cycle_decay = cycle_decay
self.cycle_limit = cycle_limit
self.warmup_t = warmup_t
self.warmup_lr_init = warmup_lr_init
self.warmup_prefix = warmup_prefix
self.t_in_epochs = t_in_epochs
self.k_decay = k_decay
if self.warmup_t:
self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
super().update_groups(self.warmup_lr_init)
else:
self.warmup_steps = [1 for _ in self.base_values]
def _get_lr(self, t):
if t < self.warmup_t:
lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
else:
if self.warmup_prefix:
t = t - self.warmup_t
if self.cycle_mul != 1:
i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul))
t_i = self.cycle_mul ** i * self.t_initial
t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial
else:
i = t // self.t_initial
t_i = self.t_initial
t_curr = t - (self.t_initial * i)
gamma = self.cycle_decay ** i
lr_max_values = [v * gamma for v in self.base_values]
k = self.k_decay
if i < self.cycle_limit:
lrs = [
self.lr_min + (lr_max - self.lr_min) * (1 - t_curr ** k / t_i ** k) ** self.power
for lr_max in lr_max_values
]
else:
lrs = [self.lr_min for _ in self.base_values]
return lrs
def get_epoch_values(self, epoch: int):
if self.t_in_epochs:
return self._get_lr(epoch)
else:
return None
def get_update_values(self, num_updates: int):
if not self.t_in_epochs:
return self._get_lr(num_updates)
else:
return None
def get_cycle_length(self, cycles=0):
cycles = max(1, cycles or self.cycle_limit)
if self.cycle_mul == 1.0:
return self.t_initial * cycles
else:
return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul)))
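For reference (not in the commit), the per-step value _get_lr produces inside a cycle, pulled out as a standalone function:
import math
def poly_lr(t_curr, t_i, lr_max, lr_min=0., power=0.5, k=0.5):
    # decays from lr_max at t_curr=0 to lr_min at t_curr=t_i; k < 1 front-loads the decay
    return lr_min + (lr_max - lr_min) * (1 - t_curr ** k / t_i ** k) ** power
print([round(poly_lr(t, 100, 0.1), 4) for t in (0, 50, 100)])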

@ -1,11 +1,12 @@
""" Scheduler Factory """ Scheduler Factory
Hacked together by / Copyright 2020 Ross Wightman Hacked together by / Copyright 2021 Ross Wightman
""" """
from .cosine_lr import CosineLRScheduler from .cosine_lr import CosineLRScheduler
from .tanh_lr import TanhLRScheduler
from .step_lr import StepLRScheduler
from .plateau_lr import PlateauLRScheduler
from .multistep_lr import MultiStepLRScheduler from .multistep_lr import MultiStepLRScheduler
from .plateau_lr import PlateauLRScheduler
from .poly_lr import PolyLRScheduler
from .step_lr import StepLRScheduler
from .tanh_lr import TanhLRScheduler
def create_scheduler(args, optimizer): def create_scheduler(args, optimizer):
@ -27,19 +28,22 @@ def create_scheduler(args, optimizer):
noise_std=getattr(args, 'lr_noise_std', 1.), noise_std=getattr(args, 'lr_noise_std', 1.),
noise_seed=getattr(args, 'seed', 42), noise_seed=getattr(args, 'seed', 42),
) )
cycle_args = dict(
cycle_mul=getattr(args, 'lr_cycle_mul', 1.),
cycle_decay=getattr(args, 'lr_cycle_decay', 0.1),
cycle_limit=getattr(args, 'lr_cycle_limit', 1),
)
lr_scheduler = None lr_scheduler = None
if args.sched == 'cosine': if args.sched == 'cosine':
lr_scheduler = CosineLRScheduler( lr_scheduler = CosineLRScheduler(
optimizer, optimizer,
t_initial=num_epochs, t_initial=num_epochs,
t_mul=getattr(args, 'lr_cycle_mul', 1.),
lr_min=args.min_lr, lr_min=args.min_lr,
decay_rate=args.decay_rate,
warmup_lr_init=args.warmup_lr, warmup_lr_init=args.warmup_lr,
warmup_t=args.warmup_epochs, warmup_t=args.warmup_epochs,
cycle_limit=getattr(args, 'lr_cycle_limit', 1), k_decay=getattr(args, 'lr_k_decay', 1.0),
t_in_epochs=True, **cycle_args,
**noise_args, **noise_args,
) )
num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs
@ -47,12 +51,11 @@ def create_scheduler(args, optimizer):
lr_scheduler = TanhLRScheduler( lr_scheduler = TanhLRScheduler(
optimizer, optimizer,
t_initial=num_epochs, t_initial=num_epochs,
t_mul=getattr(args, 'lr_cycle_mul', 1.),
lr_min=args.min_lr, lr_min=args.min_lr,
warmup_lr_init=args.warmup_lr, warmup_lr_init=args.warmup_lr,
warmup_t=args.warmup_epochs, warmup_t=args.warmup_epochs,
cycle_limit=getattr(args, 'lr_cycle_limit', 1),
t_in_epochs=True, t_in_epochs=True,
**cycle_args,
**noise_args, **noise_args,
) )
num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs
@ -87,5 +90,18 @@ def create_scheduler(args, optimizer):
cooldown_t=0, cooldown_t=0,
**noise_args, **noise_args,
) )
elif args.sched == 'poly':
lr_scheduler = PolyLRScheduler(
optimizer,
power=args.decay_rate, # overloading 'decay_rate' as polynomial power
t_initial=num_epochs,
lr_min=args.min_lr,
warmup_lr_init=args.warmup_lr,
warmup_t=args.warmup_epochs,
k_decay=getattr(args, 'lr_k_decay', 1.0),
**cycle_args,
**noise_args,
)
num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs
return lr_scheduler, num_epochs return lr_scheduler, num_epochs
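A hedged sketch (not in the commit) selecting the new 'poly' schedule through the factory; args fields not visible in this hunk (e.g. epochs) are assumed from timm's train.py argparse defaults:
from types import SimpleNamespace
import torch
from timm.scheduler import create_scheduler
args = SimpleNamespace(
    sched='poly', epochs=100, decay_rate=0.5, min_lr=1e-5,
    warmup_lr=1e-4, warmup_epochs=5, cooldown_epochs=0,
    lr_cycle_mul=1., lr_cycle_decay=0.1, lr_cycle_limit=1, lr_k_decay=1.0)
opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
lr_scheduler, num_epochs = create_scheduler(args, opt)  # num_epochs includes cooldown_epochs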

@ -2,7 +2,7 @@
TanH schedule with warmup, cycle/restarts, noise. TanH schedule with warmup, cycle/restarts, noise.
Hacked together by / Copyright 2020 Ross Wightman Hacked together by / Copyright 2021 Ross Wightman
""" """
import logging import logging
import math import math
@ -24,15 +24,15 @@ class TanhLRScheduler(Scheduler):
def __init__(self, def __init__(self,
optimizer: torch.optim.Optimizer, optimizer: torch.optim.Optimizer,
t_initial: int, t_initial: int,
lb: float = -6., lb: float = -7.,
ub: float = 4., ub: float = 3.,
t_mul: float = 1.,
lr_min: float = 0., lr_min: float = 0.,
decay_rate: float = 1., cycle_mul: float = 1.,
cycle_decay: float = 1.,
cycle_limit: int = 1,
warmup_t=0, warmup_t=0,
warmup_lr_init=0, warmup_lr_init=0,
warmup_prefix=False, warmup_prefix=False,
cycle_limit=0,
t_in_epochs=True, t_in_epochs=True,
noise_range_t=None, noise_range_t=None,
noise_pct=0.67, noise_pct=0.67,
@ -53,9 +53,9 @@ class TanhLRScheduler(Scheduler):
self.lb = lb self.lb = lb
self.ub = ub self.ub = ub
self.t_initial = t_initial self.t_initial = t_initial
self.t_mul = t_mul
self.lr_min = lr_min self.lr_min = lr_min
self.decay_rate = decay_rate self.cycle_mul = cycle_mul
self.cycle_decay = cycle_decay
self.cycle_limit = cycle_limit self.cycle_limit = cycle_limit
self.warmup_t = warmup_t self.warmup_t = warmup_t
self.warmup_lr_init = warmup_lr_init self.warmup_lr_init = warmup_lr_init
@ -75,27 +75,26 @@ class TanhLRScheduler(Scheduler):
if self.warmup_prefix: if self.warmup_prefix:
t = t - self.warmup_t t = t - self.warmup_t
if self.t_mul != 1: if self.cycle_mul != 1:
i = math.floor(math.log(1 - t / self.t_initial * (1 - self.t_mul), self.t_mul)) i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul))
t_i = self.t_mul ** i * self.t_initial t_i = self.cycle_mul ** i * self.t_initial
t_curr = t - (1 - self.t_mul ** i) / (1 - self.t_mul) * self.t_initial t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial
else: else:
i = t // self.t_initial i = t // self.t_initial
t_i = self.t_initial t_i = self.t_initial
t_curr = t - (self.t_initial * i) t_curr = t - (self.t_initial * i)
if self.cycle_limit == 0 or (self.cycle_limit > 0 and i < self.cycle_limit): if i < self.cycle_limit:
gamma = self.decay_rate ** i gamma = self.cycle_decay ** i
lr_min = self.lr_min * gamma
lr_max_values = [v * gamma for v in self.base_values] lr_max_values = [v * gamma for v in self.base_values]
tr = t_curr / t_i tr = t_curr / t_i
lrs = [ lrs = [
lr_min + 0.5 * (lr_max - lr_min) * (1 - math.tanh(self.lb * (1. - tr) + self.ub * tr)) self.lr_min + 0.5 * (lr_max - self.lr_min) * (1 - math.tanh(self.lb * (1. - tr) + self.ub * tr))
for lr_max in lr_max_values for lr_max in lr_max_values
] ]
else: else:
lrs = [self.lr_min * (self.decay_rate ** self.cycle_limit) for _ in self.base_values] lrs = [self.lr_min for _ in self.base_values]
return lrs return lrs
def get_epoch_values(self, epoch: int): def get_epoch_values(self, epoch: int):
@ -111,10 +110,8 @@ class TanhLRScheduler(Scheduler):
return None return None
def get_cycle_length(self, cycles=0): def get_cycle_length(self, cycles=0):
if not cycles: cycles = max(1, cycles or self.cycle_limit)
cycles = self.cycle_limit if self.cycle_mul == 1.0:
cycles = max(1, cycles)
if self.t_mul == 1.0:
return self.t_initial * cycles return self.t_initial * cycles
else: else:
return int(math.floor(-self.t_initial * (self.t_mul ** cycles - 1) / (1 - self.t_mul))) return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul)))
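For reference (not in the commit), the tanh schedule value inside a cycle with the new defaults lb=-7, ub=3, pulled out as a standalone function:
import math
def tanh_lr(t_curr, t_i, lr_max, lr_min=0., lb=-7., ub=3.):
    # ~lr_max at t_curr=0 (tanh(lb) ~ -1) and ~lr_min at t_curr=t_i (tanh(ub) ~ 1)
    tr = t_curr / t_i
    return lr_min + 0.5 * (lr_max - lr_min) * (1 - math.tanh(lb * (1. - tr) + ub * tr))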

@ -24,9 +24,9 @@ class AverageMeter:
def accuracy(output, target, topk=(1,)): def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k""" """Computes the accuracy over the k top predictions for the specified values of k"""
maxk = max(topk) maxk = min(max(topk), output.size()[1])
batch_size = target.size(0) batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True) _, pred = output.topk(maxk, 1, True, True)
pred = pred.t() pred = pred.t()
correct = pred.eq(target.reshape(1, -1).expand_as(pred)) correct = pred.eq(target.reshape(1, -1).expand_as(pred))
return [correct[:k].reshape(-1).float().sum(0) * 100. / batch_size for k in topk] return [correct[:min(k, maxk)].reshape(-1).float().sum(0) * 100. / batch_size for k in topk]
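A quick check (not in the commit) of why maxk is clamped, using the updated accuracy() on a 2-class output where topk=(1, 5) previously raised:
import torch
def accuracy(output, target, topk=(1,)):
    maxk = min(max(topk), output.size()[1])
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
    return [correct[:min(k, maxk)].reshape(-1).float().sum(0) * 100. / batch_size for k in topk]
out = torch.randn(4, 2)                  # only 2 classes
tgt = torch.tensor([0, 1, 0, 1])
print(accuracy(out, tgt, topk=(1, 5)))   # top-5 clamps to top-2 -> reports 100%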
