pytorch-image-models/timm/models/convmixer.py

""" ConvMixer

"""
import torch
import torch.nn as nn

from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.models.registry import register_model
from .helpers import build_model_with_cfg, checkpoint_seq
from .layers import SelectAdaptivePool2d


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .96, 'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'classifier': 'head',
        'first_conv': 'stem.0',
        **kwargs
    }


default_cfgs = {
    'convmixer_1536_20': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1536_20_ks9_p7.pth.tar'),
    'convmixer_768_32': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_768_32_ks7_p7_relu.pth.tar'),
    'convmixer_1024_20_ks9_p14': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1024_20_ks9_p14.pth.tar')
}


class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x):
        return self.fn(x) + x


class ConvMixer(nn.Module):
    def __init__(
            self, dim, depth, kernel_size=9, patch_size=7, in_chans=3, num_classes=1000, global_pool='avg',
            act_layer=nn.GELU, **kwargs):
        super().__init__()
        self.num_classes = num_classes
        self.num_features = dim
        self.grad_checkpointing = False

        self.stem = nn.Sequential(
            nn.Conv2d(in_chans, dim, kernel_size=patch_size, stride=patch_size),
            act_layer(),
            nn.BatchNorm2d(dim)
        )
        self.blocks = nn.Sequential(
            *[nn.Sequential(
                    Residual(nn.Sequential(
                        nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"),
                        act_layer(),
                        nn.BatchNorm2d(dim)
                    )),
                    nn.Conv2d(dim, dim, kernel_size=1),
                    act_layer(),
                    nn.BatchNorm2d(dim)
            ) for i in range(depth)]
        )
        self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)
        self.head = nn.Linear(dim, num_classes) if num_classes > 0 else nn.Identity()

    @torch.jit.ignore
    def group_matcher(self, coarse=False):
        matcher = dict(stem=r'^stem', blocks=r'^blocks\.(\d+)')
        return matcher

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable=True):
        self.grad_checkpointing = enable

    @torch.jit.ignore
    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=None):
        self.num_classes = num_classes
        if global_pool is not None:
            self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)
        self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
          
    def forward_features(self, x):
        x = self.stem(x)
        if self.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.blocks, x)
        else:
            x = self.blocks(x)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        x = self.pooling(x)
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def _create_convmixer(variant, pretrained=False, **kwargs):
    return build_model_with_cfg(ConvMixer, variant, pretrained, **kwargs)


@register_model
def convmixer_1536_20(pretrained=False, **kwargs):
    model_args = dict(dim=1536, depth=20, kernel_size=9, patch_size=7, **kwargs)
    return _create_convmixer('convmixer_1536_20', pretrained, **model_args)


@register_model
def convmixer_768_32(pretrained=False, **kwargs):
    model_args = dict(dim=768, depth=32, kernel_size=7, patch_size=7, act_layer=nn.ReLU, **kwargs)
    return _create_convmixer('convmixer_768_32', pretrained, **model_args)


@register_model
def convmixer_1024_20_ks9_p14(pretrained=False, **kwargs):
    model_args = dict(dim=1024, depth=20, kernel_size=9, patch_size=14, **kwargs)
    return _create_convmixer('convmixer_1024_20_ks9_p14', pretrained, **model_args)
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`""" ConvMixer`

			`"""`
			`import torch`
Add ConvMixer 3 years ago			`import torch.nn as nn`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago
Add ConvMixer 3 years ago			`from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD`
			`from timm.models.registry import register_model`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`from .helpers import build_model_with_cfg, checkpoint_seq`
			`from .layers import SelectAdaptivePool2d`
Add ConvMixer 3 years ago

			`def _cfg(url='', **kwargs):`
			`return {`
			`'url': url,`
			`'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,`
			`'crop_pct': .96, 'interpolation': 'bicubic',`
			`'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'classifier': 'head',`
			`'first_conv': 'stem.0',`
			`**kwargs`
			`}`


			`default_cfgs = {`
			`'convmixer_1536_20': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1536_20_ks9_p7.pth.tar'),`
			`'convmixer_768_32': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_768_32_ks7_p7_relu.pth.tar'),`
			`'convmixer_1024_20_ks9_p14': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1024_20_ks9_p14.pth.tar')`
			`}`


			`class Residual(nn.Module):`
			`def __init__(self, fn):`
			`super().__init__()`
			`self.fn = fn`

			`def forward(self, x):`
			`return self.fn(x) + x`


			`class ConvMixer(nn.Module):`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`def __init__(`
			`self, dim, depth, kernel_size=9, patch_size=7, in_chans=3, num_classes=1000, global_pool='avg',`
			`act_layer=nn.GELU, **kwargs):`
Add ConvMixer 3 years ago			`super().__init__()`
			`self.num_classes = num_classes`
			`self.num_features = dim`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`self.grad_checkpointing = False`

Add ConvMixer 3 years ago			`self.stem = nn.Sequential(`
			`nn.Conv2d(in_chans, dim, kernel_size=patch_size, stride=patch_size),`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`act_layer(),`
Add ConvMixer 3 years ago			`nn.BatchNorm2d(dim)`
			`)`
			`self.blocks = nn.Sequential(`
			`*[nn.Sequential(`
			`Residual(nn.Sequential(`
			`nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"),`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`act_layer(),`
Add ConvMixer 3 years ago			`nn.BatchNorm2d(dim)`
			`)),`
			`nn.Conv2d(dim, dim, kernel_size=1),`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`act_layer(),`
Add ConvMixer 3 years ago			`nn.BatchNorm2d(dim)`
			`) for i in range(depth)]`
			`)`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)`
			`self.head = nn.Linear(dim, num_classes) if num_classes > 0 else nn.Identity()`
Add ConvMixer 3 years ago
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`@torch.jit.ignore`
			`def group_matcher(self, coarse=False):`
			`matcher = dict(stem=r'^stem', blocks=r'^blocks\.(\d+)')`
			`return matcher`

			`@torch.jit.ignore`
			`def set_grad_checkpointing(self, enable=True):`
			`self.grad_checkpointing = enable`

			`@torch.jit.ignore`
Add ConvMixer 3 years ago			`def get_classifier(self):`
			`return self.head`

Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`def reset_classifier(self, num_classes, global_pool=None):`
Add ConvMixer 3 years ago			`self.num_classes = num_classes`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`if global_pool is not None:`
			`self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True)`
Add ConvMixer 3 years ago			`self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()`

			`def forward_features(self, x):`
			`x = self.stem(x)`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`if self.grad_checkpointing and not torch.jit.is_scripting():`
			`x = checkpoint_seq(self.blocks, x)`
			`else:`
			`x = self.blocks(x)`
Add ConvMixer 3 years ago			`return x`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago
			`def forward_head(self, x, pre_logits: bool = False):`
			`x = self.pooling(x)`
			`return x if pre_logits else self.head(x)`

Add ConvMixer 3 years ago			`def forward(self, x):`
			`x = self.forward_features(x)`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`x = self.forward_head(x)`
Add ConvMixer 3 years ago			`return x`


			`def _create_convmixer(variant, pretrained=False, **kwargs):`
Transitioning default_cfg -> pretrained_cfg. Improving handling of pretrained_cfg source (HF-Hub, files, timm config, etc). Checkpoint handling tweaks. 3 years ago			`return build_model_with_cfg(ConvMixer, variant, pretrained, **kwargs)`
Add ConvMixer 3 years ago

			`@register_model`
			`def convmixer_1536_20(pretrained=False, **kwargs):`
			`model_args = dict(dim=1536, depth=20, kernel_size=9, patch_size=7, **kwargs)`
			`return _create_convmixer('convmixer_1536_20', pretrained, **model_args)`


			`@register_model`
			`def convmixer_768_32(pretrained=False, **kwargs):`
Significant model refactor and additions: * All models updated with revised foward_features / forward_head interface * Vision transformer and MLP based models consistently output sequence from forward_features (pooling or token selection considered part of 'head') * WIP param grouping interface to allow consistent grouping of parameters for layer-wise decay across all model types * Add gradient checkpointing support to a significant % of models, especially popular architectures * Formatting and interface consistency improvements across models * layer-wise LR decay impl part of optimizer factory w/ scale support in scheduler * Poolformer and Volo architectures added 3 years ago			`model_args = dict(dim=768, depth=32, kernel_size=7, patch_size=7, act_layer=nn.ReLU, **kwargs)`
Add ConvMixer 3 years ago			`return _create_convmixer('convmixer_768_32', pretrained, **model_args)`


			`@register_model`
			`def convmixer_1024_20_ks9_p14(pretrained=False, **kwargs):`
			`model_args = dict(dim=1024, depth=20, kernel_size=9, patch_size=14, **kwargs)`
			`return _create_convmixer('convmixer_1024_20_ks9_p14', pretrained, **model_args)`