From ec6921fcb066678e58dcbb83b5b4a735103de174 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 21 Nov 2022 22:14:12 -0800 Subject: [PATCH] MaxVit, ViT, ConvNeXt, and EfficientNet-v2 updates * Add support for TF weights and modelling specifics to MaxVit (testing ported weights) * More fine-tuned CLIP ViT configs * ConvNeXt and MaxVit updated to new pretrained cfgs use * EfficientNetV2, MaxVit and ConvNeXt high res models use squash crop/resize --- timm/models/convnext.py | 303 ++--- timm/models/efficientnet.py | 16 +- timm/models/layers/activations.py | 14 + timm/models/layers/create_act.py | 2 + timm/models/maxxvit.py | 1865 +++++++++++++++++------------ timm/models/vision_transformer.py | 22 +- 6 files changed, 1259 insertions(+), 963 deletions(-) diff --git a/timm/models/convnext.py b/timm/models/convnext.py index 15000b40..e64bd0ef 100644 --- a/timm/models/convnext.py +++ b/timm/models/convnext.py @@ -21,111 +21,13 @@ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .helpers import named_apply, build_model_with_cfg, checkpoint_seq from .layers import trunc_normal_, SelectAdaptivePool2d, DropPath, ConvMlp, Mlp, LayerNorm2d, LayerNorm, \ create_conv2d, get_act_layer, make_divisible, to_ntuple +from ._pretrained import generate_defaults from .registry import register_model __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this -def _cfg(url='', **kwargs): - return { - 'url': url, - 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.875, 'interpolation': 'bicubic', - 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, - 'first_conv': 'stem.0', 'classifier': 'head.fc', - **kwargs - } - - -default_cfgs = dict( - # timm specific variants - convnext_atto=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth', - test_input_size=(3, 288, 288), test_crop_pct=0.95), - convnext_atto_ols=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth', - test_input_size=(3, 288, 288), test_crop_pct=0.95), - convnext_femto=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth', - test_input_size=(3, 288, 288), test_crop_pct=0.95), - convnext_femto_ols=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth', - test_input_size=(3, 288, 288), test_crop_pct=0.95), - convnext_pico=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth', - test_input_size=(3, 288, 288), test_crop_pct=0.95), - convnext_pico_ols=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth', - crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_nano=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth', - crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_nano_ols=_cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth', - crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_tiny_hnf=_cfg( - 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth', - crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), - - convnext_tiny=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_small=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_base=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_large=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", - test_input_size=(3, 288, 288), test_crop_pct=1.0), - - convnext_tiny_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth', - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_small_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth', - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_base_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth', - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_large_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth', - test_input_size=(3, 288, 288), test_crop_pct=1.0), - convnext_xlarge_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth', - test_input_size=(3, 288, 288), test_crop_pct=1.0), - - convnext_tiny_384_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth', - input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), - convnext_small_384_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth', - input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), - convnext_base_384_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth', - input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), - convnext_large_384_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth', - input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), - convnext_xlarge_384_in22ft1k=_cfg( - url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth', - input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), - - convnext_tiny_in22k=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841), - convnext_small_in22k=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841), - convnext_base_in22k=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841), - convnext_large_in22k=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841), - convnext_xlarge_in22k=_cfg( - url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841), -) - - class ConvNeXtBlock(nn.Module): """ ConvNeXt Block There are two equivalent implementations: @@ -459,6 +361,107 @@ def _create_convnext(variant, pretrained=False, **kwargs): return model + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': 
IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = generate_defaults({ + # timm specific variants + 'convnext_atto.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'convnext_atto_ols.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'convnext_femto.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'convnext_femto_ols.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'convnext_pico.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'convnext_pico_ols.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_nano.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_nano_ols.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_tiny_hnf.timm_in1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + + 'convnext_tiny.fb_in1k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_small.fb_in1k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_base.fb_in1k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_large.fb_in1k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_xlarge.untrained': _cfg(), + + 'convnext_tiny.fb_in22k_ft_in1k': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_small.fb_in22k_ft_in1k': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_base.fb_in22k_ft_in1k': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_large.fb_in22k_ft_in1k': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth', + 
test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'convnext_xlarge.fb_in22k_ft_in1k': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + + 'convnext_tiny.fb_in22k_ft_in1k_384': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), + 'convnext_small..fb_in22k_ft_in1k_384': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), + 'convnext_base.fb_in22k_ft_in1k_384': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), + 'convnext_large.fb_in22k_ft_in1k_384': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), + 'convnext_xlarge.fb_in22k_ft_in1k_384': _cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), + + 'convnext_tiny_in22k.fb_in22k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841), + 'convnext_small_in22k.fb_in22k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841), + 'convnext_base_in22k.fb_in22k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841), + 'convnext_large_in22k.fb_in22k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841), + 'convnext_xlarge_in22k.fb_in22k': _cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841), +}) + + @register_model def convnext_atto(pretrained=False, **kwargs): # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M @@ -569,105 +572,7 @@ def convnext_large(pretrained=False, **kwargs): @register_model -def convnext_tiny_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) - model = _create_convnext('convnext_tiny_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_small_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) - model = _create_convnext('convnext_small_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_base_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) - model = _create_convnext('convnext_base_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_large_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) - model = _create_convnext('convnext_large_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_xlarge_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) - model = _create_convnext('convnext_xlarge_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def 
convnext_tiny_384_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) - model = _create_convnext('convnext_tiny_384_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_small_384_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) - model = _create_convnext('convnext_small_384_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_base_384_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) - model = _create_convnext('convnext_base_384_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_large_384_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) - model = _create_convnext('convnext_large_384_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_xlarge_384_in22ft1k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) - model = _create_convnext('convnext_xlarge_384_in22ft1k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_tiny_in22k(pretrained=False, **kwargs): - model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) - model = _create_convnext('convnext_tiny_in22k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_small_in22k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) - model = _create_convnext('convnext_small_in22k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_base_in22k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) - model = _create_convnext('convnext_base_in22k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_large_in22k(pretrained=False, **kwargs): - model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) - model = _create_convnext('convnext_large_in22k', pretrained=pretrained, **model_args) - return model - - -@register_model -def convnext_xlarge_in22k(pretrained=False, **kwargs): +def convnext_xlarge(pretrained=False, **kwargs): model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) - model = _create_convnext('convnext_xlarge_in22k', pretrained=pretrained, **model_args) + model = _create_convnext('convnext_xlarge', pretrained=pretrained, **model_args) return model diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py index 51c683c0..3c0efc96 100644 --- a/timm/models/efficientnet.py +++ b/timm/models/efficientnet.py @@ -366,11 +366,11 @@ default_cfgs = { 'tf_efficientnetv2_m': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_l': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth', mean=(0.5, 0.5, 0.5), 
std=(0.5, 0.5, 0.5), - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_s_in21ft1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth', @@ -379,15 +379,15 @@ default_cfgs = { 'tf_efficientnetv2_m_in21ft1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_l_in21ft1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_xl_in21ft1k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), - input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_s_in21k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth', @@ -396,15 +396,15 @@ default_cfgs = { 'tf_efficientnetv2_m_in21k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_l_in21k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, - input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_xl_in21k': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, - input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'), 'tf_efficientnetv2_b0': _cfg( url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth', diff --git a/timm/models/layers/activations.py b/timm/models/layers/activations.py index e16b3bd3..2f5476c0 100644 --- a/timm/models/layers/activations.py +++ 
b/timm/models/layers/activations.py @@ -143,3 +143,17 @@ class GELU(nn.Module): def forward(self, input: torch.Tensor) -> torch.Tensor: return F.gelu(input) + + +def gelu_tanh(x: torch.Tensor, inplace: bool = False) -> torch.Tensor: + return F.gelu(x, approximate='tanh') + + +class GELUTanh(nn.Module): + """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg) + """ + def __init__(self, inplace: bool = False): + super(GELUTanh, self).__init__() + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.gelu(input, approximate='tanh') diff --git a/timm/models/layers/create_act.py b/timm/models/layers/create_act.py index a3044a3d..0b02398d 100644 --- a/timm/models/layers/create_act.py +++ b/timm/models/layers/create_act.py @@ -28,6 +28,7 @@ _ACT_FN_DEFAULT = dict( celu=F.celu, selu=F.selu, gelu=gelu, + gelu_tanh=gelu_tanh, sigmoid=sigmoid, tanh=tanh, hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid, @@ -71,6 +72,7 @@ _ACT_LAYER_DEFAULT = dict( celu=nn.CELU, selu=nn.SELU, gelu=GELU, + gelu_tanh=GELUTanh, sigmoid=Sigmoid, tanh=Tanh, hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoid, diff --git a/timm/models/maxxvit.py b/timm/models/maxxvit.py index bd529245..13fd7abf 100644 --- a/timm/models/maxxvit.py +++ b/timm/models/maxxvit.py @@ -52,114 +52,19 @@ from .helpers import build_model_with_cfg, checkpoint_seq, named_apply from .fx_features import register_notrace_function from .layers import Mlp, ConvMlp, DropPath, ClassifierHead, trunc_normal_tf_, LayerNorm2d, LayerNorm from .layers import create_attn, get_act_layer, get_norm_layer, get_norm_act_layer, create_conv2d +from .layers import SelectAdaptivePool2d, create_pool2d from .layers import to_2tuple, extend_tuple, make_divisible, _assert +from ._pretrained import generate_defaults from .registry import register_model from .vision_transformer_relpos import RelPosMlp, RelPosBias # FIXME move these to common location __all__ = ['MaxxVitCfg', 'MaxxVitConvCfg', 'MaxxVitTransformerCfg', 'MaxxVit'] -def _cfg(url='', **kwargs): - return { - 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), - 'crop_pct': 0.95, 'interpolation': 'bicubic', - 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), - 'first_conv': 'stem.conv1', 'classifier': 'head.fc', - 'fixed_input_size': True, - **kwargs - } - - -default_cfgs = { - # Fiddling with configs / defaults / still pretraining - 'coatnet_pico_rw_224': _cfg(url=''), - 'coatnet_nano_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_nano_rw_224_sw-f53093b4.pth', - crop_pct=0.9), - 'coatnet_0_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_0_rw_224_sw-a6439706.pth'), - 'coatnet_1_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth' - ), - 'coatnet_2_rw_224': _cfg(url=''), - 'coatnet_3_rw_224': _cfg(url=''), - - # Highly experimental configs - 'coatnet_bn_0_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_bn_0_rw_224_sw-c228e218.pth', - mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, - crop_pct=0.95), - 'coatnet_rmlp_nano_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_nano_rw_224_sw-bd1d51b3.pth', - crop_pct=0.9), - 'coatnet_rmlp_0_rw_224': _cfg(url=''), - 
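A minimal usage sketch (not part of the diff) for the `gelu_tanh` entries registered above: with the patch applied, the tanh-approximated GELU should resolve by name through the existing activation factories, assuming PyTorch >= 1.12 for F.gelu(approximate='tanh').

import torch
from timm.models.layers import get_act_layer, get_act_fn

act_layer = get_act_layer('gelu_tanh')    # resolves to the new GELUTanh module class
act_fn = get_act_fn('gelu_tanh')          # resolves to the new gelu_tanh function

x = torch.randn(2, 8)
# both paths apply F.gelu(x, approximate='tanh')
assert torch.allclose(act_layer()(x), act_fn(x))
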
'coatnet_rmlp_1_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'), - 'coatnet_rmlp_2_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_2_rw_224_sw-5ccfac55.pth'), - 'coatnet_rmlp_3_rw_224': _cfg(url=''), - 'coatnet_nano_cc_224': _cfg(url=''), - 'coatnext_nano_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnext_nano_rw_224_ad-22cb71c2.pth', - crop_pct=0.9), - - # Trying to be like the CoAtNet paper configs - 'coatnet_0_224': _cfg(url=''), - 'coatnet_1_224': _cfg(url=''), - 'coatnet_2_224': _cfg(url=''), - 'coatnet_3_224': _cfg(url=''), - 'coatnet_4_224': _cfg(url=''), - 'coatnet_5_224': _cfg(url=''), - - # Experimental configs - 'maxvit_pico_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_nano_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_nano_rw_256_sw-fb127241.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_tiny_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_tiny_rw_224_sw-7d0dffeb.pth'), - 'maxvit_tiny_rw_256': _cfg( - url='', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_rmlp_pico_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_pico_rw_256_sw-8d82f2c6.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_rmlp_nano_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_rmlp_tiny_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxvit_rmlp_small_rw_224': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_small_rw_224_sw-6ef0ae4f.pth', - crop_pct=0.9, - ), - 'maxvit_rmlp_small_rw_256': _cfg( - url='', - input_size=(3, 256, 256), pool_size=(8, 8)), - - 'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), - - 'maxxvit_rmlp_nano_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_nano_rw_256_sw-0325d459.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxxvit_rmlp_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), - 'maxxvit_rmlp_small_rw_256': _cfg( - url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_small_rw_256_sw-37e217ff.pth', - input_size=(3, 256, 256), pool_size=(8, 8)), - - # Trying to be like the MaxViT paper configs - 'maxvit_tiny_224': _cfg(url=''), - 'maxvit_small_224': _cfg(url=''), - 'maxvit_base_224': _cfg(url=''), - 'maxvit_large_224': _cfg(url=''), - 'maxvit_xlarge_224': _cfg(url=''), -} - - @dataclass class MaxxVitTransformerCfg: dim_head: int = 32 + head_first: bool = True # head ordering in qkv channel dim expand_ratio: float = 4.0 expand_first: bool = True shortcut_bias: bool = True @@ -199,6 +104,7 @@ class MaxxVitConvCfg: stride_mode: str = 'dw' # stride done via one of 'pool', '1x1', 'dw' pool_type: str = 'avg2' downsample_pool_type: str = 
'avg2' + padding: str = '' attn_early: bool = False # apply attn between conv2 and norm2, instead of after norm2 attn_layer: str = 'se' attn_act_layer: str = 'silu' @@ -228,499 +134,56 @@ class MaxxVitCfg: depths: Tuple[int, ...] = (2, 3, 5, 2) block_type: Tuple[Union[str, Tuple[str, ...]], ...] = ('C', 'C', 'T', 'T') stem_width: Union[int, Tuple[int, int]] = 64 - stem_bias: bool = True + stem_bias: bool = False conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg() transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg() + head_hidden_size: int = None weight_init: str = 'vit_eff' -def _rw_coat_cfg( - stride_mode='pool', - pool_type='avg2', - conv_output_bias=False, - conv_attn_early=False, - conv_attn_act_layer='relu', - conv_norm_layer='', - transformer_shortcut_bias=True, - transformer_norm_layer='layernorm2d', - transformer_norm_layer_cl='layernorm', - init_values=None, - rel_pos_type='bias', - rel_pos_dim=512, -): - # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit - # Common differences for initial timm models: - # - pre-norm layer in MZBConv included an activation after norm - # - mbconv expansion calculated from input instead of output chs - # - mbconv shortcut and final 1x1 conv did not have a bias - # - SE act layer was relu, not silu - # - mbconv uses silu in timm, not gelu - # - expansion in attention block done via output proj, not input proj - # Variable differences (evolved over training initial models): - # - avg pool with kernel_size=2 favoured downsampling (instead of maxpool for coat) - # - SE attention was between conv2 and norm/act - # - default to avg pool for mbconv downsample instead of 1x1 or dw conv - # - transformer block shortcut has no bias - return dict( - conv_cfg=MaxxVitConvCfg( - stride_mode=stride_mode, - pool_type=pool_type, - pre_norm_act=True, - expand_output=False, - output_bias=conv_output_bias, - attn_early=conv_attn_early, - attn_act_layer=conv_attn_act_layer, - act_layer='silu', - norm_layer=conv_norm_layer, - ), - transformer_cfg=MaxxVitTransformerCfg( - expand_first=False, - shortcut_bias=transformer_shortcut_bias, - pool_type=pool_type, - init_values=init_values, - norm_layer=transformer_norm_layer, - norm_layer_cl=transformer_norm_layer_cl, - rel_pos_type=rel_pos_type, - rel_pos_dim=rel_pos_dim, - ), - ) - +class Attention2d(nn.Module): + """ multi-head attention for 2D NCHW tensors""" + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + dim_head: int = 32, + bias: bool = True, + expand_first: bool = True, + head_first: bool = True, + rel_pos_cls: Callable = None, + attn_drop: float = 0., + proj_drop: float = 0. 
+ ): + super().__init__() + dim_out = dim_out or dim + dim_attn = dim_out if expand_first else dim + self.num_heads = dim_attn // dim_head + self.dim_head = dim_head + self.head_first = head_first + self.scale = dim_head ** -0.5 -def _rw_max_cfg( - stride_mode='dw', - pool_type='avg2', - conv_output_bias=False, - conv_attn_ratio=1 / 16, - conv_norm_layer='', - transformer_norm_layer='layernorm2d', - transformer_norm_layer_cl='layernorm', - window_size=None, - dim_head=32, - init_values=None, - rel_pos_type='bias', - rel_pos_dim=512, -): - # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit - # Differences of initial timm models: - # - mbconv expansion calculated from input instead of output chs - # - mbconv shortcut and final 1x1 conv did not have a bias - # - mbconv uses silu in timm, not gelu - # - expansion in attention block done via output proj, not input proj - return dict( - conv_cfg=MaxxVitConvCfg( - stride_mode=stride_mode, - pool_type=pool_type, - expand_output=False, - output_bias=conv_output_bias, - attn_ratio=conv_attn_ratio, - act_layer='silu', - norm_layer=conv_norm_layer, - ), - transformer_cfg=MaxxVitTransformerCfg( - expand_first=False, - pool_type=pool_type, - dim_head=dim_head, - window_size=window_size, - init_values=init_values, - norm_layer=transformer_norm_layer, - norm_layer_cl=transformer_norm_layer_cl, - rel_pos_type=rel_pos_type, - rel_pos_dim=rel_pos_dim, - ), - ) + self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias) + self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Conv2d(dim_attn, dim_out, 1, bias=bias) + self.proj_drop = nn.Dropout(proj_drop) + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + B, C, H, W = x.shape -def _next_cfg( - stride_mode='dw', - pool_type='avg2', - conv_norm_layer='layernorm2d', - conv_norm_layer_cl='layernorm', - transformer_norm_layer='layernorm2d', - transformer_norm_layer_cl='layernorm', - window_size=None, - init_values=1e-6, - rel_pos_type='mlp', # MLP by default for maxxvit - rel_pos_dim=512, -): - # For experimental models with convnext instead of mbconv - init_values = to_2tuple(init_values) - return dict( - conv_cfg=MaxxVitConvCfg( - block_type='convnext', - stride_mode=stride_mode, - pool_type=pool_type, - expand_output=False, - init_values=init_values[0], - norm_layer=conv_norm_layer, - norm_layer_cl=conv_norm_layer_cl, - ), - transformer_cfg=MaxxVitTransformerCfg( - expand_first=False, - pool_type=pool_type, - window_size=window_size, - init_values=init_values[1], - norm_layer=transformer_norm_layer, - norm_layer_cl=transformer_norm_layer_cl, - rel_pos_type=rel_pos_type, - rel_pos_dim=rel_pos_dim, - ), - ) + if self.head_first: + q, k, v = self.qkv(x).view(B, self.num_heads, self.dim_head * 3, -1).chunk(3, dim=2) + else: + q, k, v = self.qkv(x).reshape(B, 3, self.num_heads, self.dim_head, -1).unbind(1) - -model_cfgs = dict( - # Fiddling with configs / defaults / still pretraining - coatnet_pico_rw_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 3, 5, 2), - stem_width=(32, 64), - **_rw_max_cfg( # using newer max defaults here - conv_output_bias=True, - conv_attn_ratio=0.25, - ), - ), - coatnet_nano_rw_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(3, 4, 6, 3), - stem_width=(32, 64), - **_rw_max_cfg( # using newer max defaults here - stride_mode='pool', - conv_output_bias=True, - conv_attn_ratio=0.25, - ), - ), - 
coatnet_0_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 3, 7, 2), # deeper than paper '0' model - stem_width=(32, 64), - **_rw_coat_cfg( - conv_attn_early=True, - transformer_shortcut_bias=False, - ), - ), - coatnet_1_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 6, 14, 2), - stem_width=(32, 64), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_early=True, - transformer_shortcut_bias=False, - ) - ), - coatnet_2_rw_224=MaxxVitCfg( - embed_dim=(128, 256, 512, 1024), - depths=(2, 6, 14, 2), - stem_width=(64, 128), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_act_layer='silu', - init_values=1e-6, - ), - ), - coatnet_3_rw_224=MaxxVitCfg( - embed_dim=(192, 384, 768, 1536), - depths=(2, 6, 14, 2), - stem_width=(96, 192), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_act_layer='silu', - init_values=1e-6, - ), - ), - - # Highly experimental configs - coatnet_bn_0_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 3, 7, 2), # deeper than paper '0' model - stem_width=(32, 64), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_early=True, - transformer_shortcut_bias=False, - transformer_norm_layer='batchnorm2d', - ) - ), - coatnet_rmlp_nano_rw_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(3, 4, 6, 3), - stem_width=(32, 64), - **_rw_max_cfg( - conv_output_bias=True, - conv_attn_ratio=0.25, - rel_pos_type='mlp', - rel_pos_dim=384, - ), - ), - coatnet_rmlp_0_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 3, 7, 2), # deeper than paper '0' model - stem_width=(32, 64), - **_rw_coat_cfg( - stride_mode='dw', - rel_pos_type='mlp', - ), - ), - coatnet_rmlp_1_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 6, 14, 2), - stem_width=(32, 64), - **_rw_coat_cfg( - pool_type='max', - conv_attn_early=True, - transformer_shortcut_bias=False, - rel_pos_type='mlp', - rel_pos_dim=384, # was supposed to be 512, woops - ), - ), - coatnet_rmlp_2_rw_224=MaxxVitCfg( - embed_dim=(128, 256, 512, 1024), - depths=(2, 6, 14, 2), - stem_width=(64, 128), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_act_layer='silu', - init_values=1e-6, - rel_pos_type='mlp' - ), - ), - coatnet_rmlp_3_rw_224=MaxxVitCfg( - embed_dim=(192, 384, 768, 1536), - depths=(2, 6, 14, 2), - stem_width=(96, 192), - **_rw_coat_cfg( - stride_mode='dw', - conv_attn_act_layer='silu', - init_values=1e-6, - rel_pos_type='mlp' - ), - ), - - coatnet_nano_cc_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(3, 4, 6, 3), - stem_width=(32, 64), - block_type=('C', 'C', ('C', 'T'), ('C', 'T')), - **_rw_coat_cfg(), - ), - coatnext_nano_rw_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(3, 4, 6, 3), - stem_width=(32, 64), - weight_init='normal', - **_next_cfg( - rel_pos_type='bias', - init_values=(1e-5, None) - ), - ), - - # Trying to be like the CoAtNet paper configs - coatnet_0_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 3, 5, 2), - stem_width=64, - ), - coatnet_1_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 6, 14, 2), - stem_width=64, - ), - coatnet_2_224=MaxxVitCfg( - embed_dim=(128, 256, 512, 1024), - depths=(2, 6, 14, 2), - stem_width=128, - ), - coatnet_3_224=MaxxVitCfg( - embed_dim=(192, 384, 768, 1536), - depths=(2, 6, 14, 2), - stem_width=192, - ), - coatnet_4_224=MaxxVitCfg( - embed_dim=(192, 384, 768, 1536), - depths=(2, 12, 28, 2), - stem_width=192, - ), - coatnet_5_224=MaxxVitCfg( - embed_dim=(256, 512, 1280, 2048), - depths=(2, 12, 28, 2), - stem_width=192, - ), - - # Experimental MaxVit configs - 
maxvit_pico_rw_256=MaxxVitCfg( - embed_dim=(32, 64, 128, 256), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(24, 32), - **_rw_max_cfg(), - ), - maxvit_nano_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(1, 2, 3, 1), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(), - ), - maxvit_tiny_rw_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(), - ), - maxvit_tiny_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(), - ), - - maxvit_rmlp_pico_rw_256=MaxxVitCfg( - embed_dim=(32, 64, 128, 256), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(24, 32), - **_rw_max_cfg(rel_pos_type='mlp'), - ), - maxvit_rmlp_nano_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(1, 2, 3, 1), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(rel_pos_type='mlp'), - ), - maxvit_rmlp_tiny_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(rel_pos_type='mlp'), - ), - maxvit_rmlp_small_rw_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg( - rel_pos_type='mlp', - init_values=1e-6, - ), - ), - maxvit_rmlp_small_rw_256=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_rw_max_cfg( - rel_pos_type='mlp', - init_values=1e-6, - ), - ), - - maxvit_tiny_pm_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('PM',) * 4, - stem_width=(32, 64), - **_rw_max_cfg(), - ), - - maxxvit_rmlp_nano_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(1, 2, 3, 1), - block_type=('M',) * 4, - stem_width=(32, 64), - weight_init='normal', - **_next_cfg(), - ), - maxxvit_rmlp_tiny_rw_256=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(32, 64), - **_next_cfg(), - ), - maxxvit_rmlp_small_rw_256=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=(48, 96), - **_next_cfg(), - ), - - # Trying to be like the MaxViT paper configs - maxvit_tiny_224=MaxxVitCfg( - embed_dim=(64, 128, 256, 512), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=64, - ), - maxvit_small_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 2, 5, 2), - block_type=('M',) * 4, - stem_width=64, - ), - maxvit_base_224=MaxxVitCfg( - embed_dim=(96, 192, 384, 768), - depths=(2, 6, 14, 2), - block_type=('M',) * 4, - stem_width=64, - ), - maxvit_large_224=MaxxVitCfg( - embed_dim=(128, 256, 512, 1024), - depths=(2, 6, 14, 2), - block_type=('M',) * 4, - stem_width=128, - ), - maxvit_xlarge_224=MaxxVitCfg( - embed_dim=(192, 384, 768, 1536), - depths=(2, 6, 14, 2), - block_type=('M',) * 4, - stem_width=192, - ), - -) - - -class Attention2d(nn.Module): - """ multi-head attention for 2D NCHW tensors""" - def __init__( - self, - dim: int, - dim_out: Optional[int] = None, - dim_head: int = 32, - bias: bool = True, - expand_first: bool = True, - rel_pos_cls: Callable = None, - attn_drop: float = 0., - proj_drop: float = 0. 
- ): - super().__init__() - dim_out = dim_out or dim - dim_attn = dim_out if expand_first else dim - self.num_heads = dim_attn // dim_head - self.dim_head = dim_head - self.scale = dim_head ** -0.5 - - self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias) - self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Conv2d(dim_attn, dim_out, 1, bias=bias) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): - B, C, H, W = x.shape - - q, k, v = self.qkv(x).view(B, self.num_heads, self.dim_head * 3, -1).chunk(3, dim=2) - - attn = (q.transpose(-2, -1) @ k) * self.scale - if self.rel_pos is not None: - attn = self.rel_pos(attn) - elif shared_rel_pos is not None: - attn = attn + shared_rel_pos - attn = attn.softmax(dim=-1) - attn = self.attn_drop(attn) + attn = (q.transpose(-2, -1) @ k) * self.scale + if self.rel_pos is not None: + attn = self.rel_pos(attn) + elif shared_rel_pos is not None: + attn = attn + shared_rel_pos + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) x = self.proj(x) @@ -737,6 +200,7 @@ class AttentionCl(nn.Module): dim_head: int = 32, bias: bool = True, expand_first: bool = True, + head_first: bool = True, rel_pos_cls: Callable = None, attn_drop: float = 0., proj_drop: float = 0. @@ -747,6 +211,7 @@ class AttentionCl(nn.Module): assert dim_attn % dim_head == 0, 'attn dim should be divisible by head_dim' self.num_heads = dim_attn // dim_head self.dim_head = dim_head + self.head_first = head_first self.scale = dim_head ** -0.5 self.qkv = nn.Linear(dim, dim_attn * 3, bias=bias) @@ -759,7 +224,10 @@ class AttentionCl(nn.Module): B = x.shape[0] restore_shape = x.shape[:-1] - q, k, v = self.qkv(x).view(B, -1, self.num_heads, self.dim_head * 3).transpose(1, 2).chunk(3, dim=3) + if self.head_first: + q, k, v = self.qkv(x).view(B, -1, self.num_heads, self.dim_head * 3).transpose(1, 2).chunk(3, dim=3) + else: + q, k, v = self.qkv(x).reshape(B, -1, 3, self.num_heads, self.dim_head).transpose(1, 3).unbind(2) attn = (q @ k.transpose(-2, -1)) * self.scale if self.rel_pos is not None: @@ -810,18 +278,20 @@ class Downsample2d(nn.Module): dim: int, dim_out: int, pool_type: str = 'avg2', + padding: str = '', bias: bool = True, ): super().__init__() assert pool_type in ('max', 'max2', 'avg', 'avg2') if pool_type == 'max': - self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.pool = create_pool2d('max', kernel_size=3, stride=2, padding=padding or 1) elif pool_type == 'max2': - self.pool = nn.MaxPool2d(2) # kernel_size == stride == 2 + self.pool = create_pool2d('max', 2, padding=padding or 0) # kernel_size == stride == 2 elif pool_type == 'avg': - self.pool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False) + self.pool = create_pool2d( + 'avg', kernel_size=3, stride=2, count_include_pad=False, padding=padding or 1) else: - self.pool = nn.AvgPool2d(2) # kernel_size == stride == 2 + self.pool = create_pool2d('avg', 2, padding=padding or 0) if dim != dim_out: self.expand = nn.Conv2d(dim, dim_out, 1, bias=bias) @@ -973,7 +443,8 @@ class MbConvBlock(nn.Module): groups = num_groups(cfg.group_size, mid_chs) if stride == 2: - self.shortcut = Downsample2d(in_chs, out_chs, pool_type=cfg.pool_type, bias=cfg.output_bias) + self.shortcut = Downsample2d( + in_chs, out_chs, pool_type=cfg.pool_type, bias=cfg.output_bias, padding=cfg.padding) else: self.shortcut = 
nn.Identity() @@ -991,14 +462,15 @@ class MbConvBlock(nn.Module): self.pre_norm = norm_act_layer(in_chs, apply_act=cfg.pre_norm_act) if stride_pool > 1: - self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type) + self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type, padding=cfg.padding) else: self.down = nn.Identity() self.conv1_1x1 = create_conv2d(in_chs, mid_chs, 1, stride=stride_1) self.norm1 = norm_act_layer(mid_chs) self.conv2_kxk = create_conv2d( - mid_chs, mid_chs, cfg.kernel_size, stride=stride_2, dilation=dilation_2, groups=groups) + mid_chs, mid_chs, cfg.kernel_size, + stride=stride_2, dilation=dilation_2, groups=groups, padding=cfg.padding) attn_kwargs = {} if isinstance(cfg.attn_layer, str): @@ -1164,6 +636,8 @@ def get_rel_pos_cls(cfg: MaxxVitTransformerCfg, window_size): rel_pos_cls = partial(RelPosMlp, window_size=window_size, hidden_dim=cfg.rel_pos_dim) elif cfg.rel_pos_type == 'bias': rel_pos_cls = partial(RelPosBias, window_size=window_size) + elif cfg.rel_pos_type == 'bias_tf': + rel_pos_cls = partial(RelPosBiasTf, window_size=window_size) return rel_pos_cls @@ -1193,6 +667,7 @@ class PartitionAttentionCl(nn.Module): dim, dim_head=cfg.dim_head, bias=cfg.attn_bias, + head_first=cfg.head_first, rel_pos_cls=rel_pos_cls, attn_drop=cfg.attn_drop, proj_drop=cfg.proj_drop, @@ -1256,6 +731,7 @@ class ParallelPartitionAttention(nn.Module): dim // 2, dim_head=cfg.dim_head, bias=cfg.attn_bias, + head_first=cfg.head_first, rel_pos_cls=rel_pos_cls, attn_drop=cfg.attn_drop, proj_drop=cfg.proj_drop, @@ -1265,6 +741,7 @@ class ParallelPartitionAttention(nn.Module): dim // 2, dim_head=cfg.dim_head, bias=cfg.attn_bias, + head_first=cfg.head_first, rel_pos_cls=rel_pos_cls, attn_drop=cfg.attn_drop, proj_drop=cfg.proj_drop, @@ -1364,6 +841,7 @@ class PartitionAttention2d(nn.Module): dim, dim_head=cfg.dim_head, bias=cfg.attn_bias, + head_first=cfg.head_first, rel_pos_cls=rel_pos_cls, attn_drop=cfg.attn_drop, proj_drop=cfg.proj_drop, @@ -1413,6 +891,7 @@ class MaxxVitBlock(nn.Module): conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(), transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), use_nchw_attn: bool = False, # FIXME move to cfg? 
True is ~20-30% faster on TPU, 5-10% slower on GPU + use_block_attn: bool = True, # FIXME for testing ConvNeXt conv w/o block attention drop_path: float = 0., ): super().__init__() @@ -1423,11 +902,12 @@ class MaxxVitBlock(nn.Module): attn_kwargs = dict(dim=dim_out, cfg=transformer_cfg, drop_path=drop_path) partition_layer = PartitionAttention2d if use_nchw_attn else PartitionAttentionCl self.nchw_attn = use_nchw_attn - self.attn_block = partition_layer(**attn_kwargs) + self.attn_block = partition_layer(**attn_kwargs) if use_block_attn else None self.attn_grid = partition_layer(partition_type='grid', **attn_kwargs) def init_weights(self, scheme=''): - named_apply(partial(_init_transformer, scheme=scheme), self.attn_block) + if self.attn_block is not None: + named_apply(partial(_init_transformer, scheme=scheme), self.attn_block) named_apply(partial(_init_transformer, scheme=scheme), self.attn_grid) named_apply(partial(_init_conv, scheme=scheme), self.conv) @@ -1437,7 +917,8 @@ class MaxxVitBlock(nn.Module): if not self.nchw_attn: x = x.permute(0, 2, 3, 1) # to NHWC (channels-last) - x = self.attn_block(x) + if self.attn_block is not None: + x = self.attn_block(x) x = self.attn_grid(x) if not self.nchw_attn: x = x.permute(0, 3, 1, 2) # back to NCHW @@ -1544,183 +1025,1004 @@ class MaxxVitStage(nn.Module): self.blocks = nn.Sequential(*blocks) def forward(self, x): - if self.grad_checkpointing and not torch.jit.is_scripting(): - x = checkpoint_seq(self.blocks, x) - else: - x = self.blocks(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + +class Stem(nn.Module): + + def __init__( + self, + in_chs: int, + out_chs: int, + kernel_size: int = 3, + padding: str = '', + bias: bool = False, + act_layer: str = 'gelu', + norm_layer: str = 'batchnorm2d', + norm_eps: float = 1e-5, + ): + super().__init__() + if not isinstance(out_chs, (list, tuple)): + out_chs = to_2tuple(out_chs) + + norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps) + self.out_chs = out_chs[-1] + self.stride = 2 + + self.conv1 = create_conv2d(in_chs, out_chs[0], kernel_size, stride=2, padding=padding, bias=bias) + self.norm1 = norm_act_layer(out_chs[0]) + self.conv2 = create_conv2d(out_chs[0], out_chs[1], kernel_size, stride=1, padding=padding, bias=bias) + + def init_weights(self, scheme=''): + named_apply(partial(_init_conv, scheme=scheme), self) + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.conv2(x) + return x + + +def cfg_window_size(cfg: MaxxVitTransformerCfg, img_size: Tuple[int, int]): + if cfg.window_size is not None: + assert cfg.grid_size + return cfg + partition_size = img_size[0] // cfg.partition_ratio, img_size[1] // cfg.partition_ratio + cfg = replace(cfg, window_size=partition_size, grid_size=partition_size) + return cfg + + +def generate_lookup_tensor( + length: int, + max_relative_position: Optional[int] = None, +): + """Generate a one_hot lookup tensor to reindex embeddings along one dimension. + Args: + length: the length to reindex to. + max_relative_position: the maximum relative position to consider. + Relative position embeddings for distances above this threshold + are zeroed out. + Returns: + a lookup Tensor of size [length, length, vocab_size] that satisfies + ret[n,m,v] = 1{m - n + max_relative_position = v}. 
+ """ + if max_relative_position is None: + max_relative_position = length - 1 + # Return the cached lookup tensor, otherwise compute it and cache it. + vocab_size = 2 * max_relative_position + 1 + ret = torch.zeros(length, length, vocab_size) + for i in range(length): + for x in range(length): + v = x - i + max_relative_position + if abs(x - i) > max_relative_position: + continue + ret[i, x, v] = 1 + return ret + + +def reindex_2d_einsum_lookup( + relative_position_tensor, + height: int, + width: int, + height_lookup: torch.Tensor, + width_lookup: torch.Tensor, +) -> torch.Tensor: + """Reindex 2d relative position bias with 2 independent einsum lookups. + Args: + relative_position_tensor: tensor of shape + [..., vocab_height, vocab_width, ...]. + height: height to reindex to. + width: width to reindex to. + height_lookup: one-hot height lookup + width_lookup: one-hot width lookup + Returns: + reindexed_tensor: a Tensor of shape + [..., height * width, height * width, ...] + """ + reindexed_tensor = torch.einsum('nhw,ixh->nixw', relative_position_tensor, height_lookup) + reindexed_tensor = torch.einsum('nixw,jyw->nijxy', reindexed_tensor, width_lookup) + area = height * width + return reindexed_tensor.reshape(relative_position_tensor.shape[0], area, area) + + +class RelPosBiasTf(nn.Module): + + def __init__(self, window_size, num_heads, prefix_tokens=0): + super().__init__() + assert prefix_tokens <= 1 + self.window_size = window_size + self.window_area = window_size[0] * window_size[1] + self.num_heads = num_heads + + vocab_height = 2 * window_size[0] - 1 + vocab_width = 2 * window_size[1] - 1 + self.bias_shape = (self.num_heads, vocab_height, vocab_width) + self.relative_position_bias_table = nn.Parameter(torch.zeros(self.bias_shape)) + self.register_buffer('height_lookup', generate_lookup_tensor(window_size[0]), persistent=False) + self.register_buffer('width_lookup', generate_lookup_tensor(window_size[1]), persistent=False) + self.init_weights() + + def init_weights(self): + nn.init.normal_(self.relative_position_bias_table, std=.02) + + def get_bias(self) -> torch.Tensor: + # FIXME change to not use one-hot/einsum? + return reindex_2d_einsum_lookup( + self.relative_position_bias_table, + self.window_size[0], + self.window_size[1], + self.height_lookup, + self.width_lookup + ) + + def forward(self, attn, shared_rel_pos: Optional[torch.Tensor] = None): + return attn + self.get_bias() + + +class NormMlpHead(nn.Module): + + def __init__( + self, + in_features, + num_classes, + hidden_size=None, + pool_type='avg', + drop_rate=0., + norm_layer=nn.LayerNorm, + act_layer=nn.Tanh, + ): + super().__init__() + self.drop_rate = drop_rate + self.num_features = in_features + + self.global_pool = SelectAdaptivePool2d(pool_type=pool_type) + self.norm = norm_layer(in_features) + self.flatten = nn.Flatten(1) if pool_type else nn.Identity() + if hidden_size: + self.pre_logits = nn.Sequential(OrderedDict([ + ('fc', nn.Linear(in_features, hidden_size)), + ('act', act_layer()), + ])) + self.num_features = hidden_size + else: + self.pre_logits = nn.Identity() + self.drop = nn.Dropout(self.drop_rate) + self.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward(self, x, pre_logits: bool = False): + x = self.global_pool(x) + x = self.norm(x) + x = self.flatten(x) + x = self.pre_logits(x) + if pre_logits: + return x + x = self.fc(x) + return x + + +class MaxxVit(nn.Module): + """ CoaTNet + MaxVit base model. 
+ + Highly configurable for different block compositions, tensor layouts, pooling types. + """ + + def __init__( + self, + cfg: MaxxVitCfg, + img_size: Union[int, Tuple[int, int]] = 224, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: str = 'avg', + drop_rate: float = 0., + drop_path_rate: float = 0. + ): + super().__init__() + img_size = to_2tuple(img_size) + transformer_cfg = cfg_window_size(cfg.transformer_cfg, img_size) + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = cfg.embed_dim[-1] + self.drop_rate = drop_rate + self.grad_checkpointing = False + + self.stem = Stem( + in_chs=in_chans, + out_chs=cfg.stem_width, + padding=cfg.conv_cfg.padding, + bias=cfg.stem_bias, + act_layer=cfg.conv_cfg.act_layer, + norm_layer=cfg.conv_cfg.norm_layer, + norm_eps=cfg.conv_cfg.norm_eps, + ) + + stride = self.stem.stride + feat_size = tuple([i // s for i, s in zip(img_size, to_2tuple(stride))]) + + num_stages = len(cfg.embed_dim) + assert len(cfg.depths) == num_stages + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)] + in_chs = self.stem.out_chs + stages = [] + for i in range(num_stages): + stage_stride = 2 + out_chs = cfg.embed_dim[i] + feat_size = tuple([(r - 1) // stage_stride + 1 for r in feat_size]) + stages += [MaxxVitStage( + in_chs, + out_chs, + depth=cfg.depths[i], + block_types=cfg.block_type[i], + conv_cfg=cfg.conv_cfg, + transformer_cfg=transformer_cfg, + feat_size=feat_size, + drop_path=dpr[i], + )] + stride *= stage_stride + in_chs = out_chs + self.stages = nn.Sequential(*stages) + + final_norm_layer = partial(get_norm_layer(cfg.transformer_cfg.norm_layer), eps=cfg.transformer_cfg.norm_eps) + if cfg.head_hidden_size: + self.norm = nn.Identity() + self.head = NormMlpHead( + self.num_features, + num_classes, + hidden_size=cfg.head_hidden_size, + pool_type=global_pool, + drop_rate=drop_rate, + norm_layer=final_norm_layer, + ) + else: + # standard classifier head w/ norm, pooling, fc classifier + self.norm = final_norm_layer(self.num_features) + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + # Weight init (default PyTorch init works well for AdamW if scheme not set) + assert cfg.weight_init in ('', 'normal', 'trunc_normal', 'xavier_normal', 'vit_eff') + if cfg.weight_init: + named_apply(partial(self._init_weights, scheme=cfg.weight_init), self) + + def _init_weights(self, module, name, scheme=''): + if hasattr(module, 'init_weights'): + try: + module.init_weights(scheme=scheme) + except TypeError: + module.init_weights() + + @torch.jit.ignore + def no_weight_decay(self): + return { + k for k, _ in self.named_parameters() + if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', # stem and embed + blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is None: + global_pool = self.head.global_pool.pool_type + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + x = 
self.stages(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) return x -class Stem(nn.Module): +def _rw_coat_cfg( + stride_mode='pool', + pool_type='avg2', + conv_output_bias=False, + conv_attn_early=False, + conv_attn_act_layer='relu', + conv_norm_layer='', + transformer_shortcut_bias=True, + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + init_values=None, + rel_pos_type='bias', + rel_pos_dim=512, +): + # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit + # Common differences for initial timm models: + # - pre-norm layer in MZBConv included an activation after norm + # - mbconv expansion calculated from input instead of output chs + # - mbconv shortcut and final 1x1 conv did not have a bias + # - SE act layer was relu, not silu + # - mbconv uses silu in timm, not gelu + # - expansion in attention block done via output proj, not input proj + # Variable differences (evolved over training initial models): + # - avg pool with kernel_size=2 favoured downsampling (instead of maxpool for coat) + # - SE attention was between conv2 and norm/act + # - default to avg pool for mbconv downsample instead of 1x1 or dw conv + # - transformer block shortcut has no bias + return dict( + conv_cfg=MaxxVitConvCfg( + stride_mode=stride_mode, + pool_type=pool_type, + pre_norm_act=True, + expand_output=False, + output_bias=conv_output_bias, + attn_early=conv_attn_early, + attn_act_layer=conv_attn_act_layer, + act_layer='silu', + norm_layer=conv_norm_layer, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + shortcut_bias=transformer_shortcut_bias, + pool_type=pool_type, + init_values=init_values, + norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) - def __init__( - self, - in_chs: int, - out_chs: int, - kernel_size: int = 3, - act_layer: str = 'gelu', - norm_layer: str = 'batchnorm2d', - norm_eps: float = 1e-5, - ): - super().__init__() - if not isinstance(out_chs, (list, tuple)): - out_chs = to_2tuple(out_chs) - norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps) - self.out_chs = out_chs[-1] - self.stride = 2 +def _rw_max_cfg( + stride_mode='dw', + pool_type='avg2', + conv_output_bias=False, + conv_attn_ratio=1 / 16, + conv_norm_layer='', + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + window_size=None, + dim_head=32, + init_values=None, + rel_pos_type='bias', + rel_pos_dim=512, +): + # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit + # Differences of initial timm models: + # - mbconv expansion calculated from input instead of output chs + # - mbconv shortcut and final 1x1 conv did not have a bias + # - mbconv uses silu in timm, not gelu + # - expansion in attention block done via output proj, not input proj + return dict( + conv_cfg=MaxxVitConvCfg( + stride_mode=stride_mode, + pool_type=pool_type, + expand_output=False, + output_bias=conv_output_bias, + attn_ratio=conv_attn_ratio, + act_layer='silu', + norm_layer=conv_norm_layer, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + pool_type=pool_type, + dim_head=dim_head, + window_size=window_size, + init_values=init_values, + 
norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) - self.conv1 = create_conv2d(in_chs, out_chs[0], kernel_size, stride=2) - self.norm1 = norm_act_layer(out_chs[0]) - self.conv2 = create_conv2d(out_chs[0], out_chs[1], kernel_size, stride=1) - def init_weights(self, scheme=''): - named_apply(partial(_init_conv, scheme=scheme), self) +def _next_cfg( + stride_mode='dw', + pool_type='avg2', + conv_norm_layer='layernorm2d', + conv_norm_layer_cl='layernorm', + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + window_size=None, + init_values=1e-6, + rel_pos_type='mlp', # MLP by default for maxxvit + rel_pos_dim=512, +): + # For experimental models with convnext instead of mbconv + init_values = to_2tuple(init_values) + return dict( + conv_cfg=MaxxVitConvCfg( + block_type='convnext', + stride_mode=stride_mode, + pool_type=pool_type, + expand_output=False, + init_values=init_values[0], + norm_layer=conv_norm_layer, + norm_layer_cl=conv_norm_layer_cl, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + pool_type=pool_type, + window_size=window_size, + init_values=init_values[1], + norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) - def forward(self, x): - x = self.conv1(x) - x = self.norm1(x) - x = self.conv2(x) - return x + +def _tf_cfg(): + return dict( + conv_cfg=MaxxVitConvCfg( + norm_eps=1e-3, + act_layer='gelu_tanh', + padding='same', + ), + transformer_cfg=MaxxVitTransformerCfg( + norm_eps=1e-5, + act_layer='gelu_tanh', + head_first=False, # heads are interleaved (q_nh, q_hdim, k_nh, q_hdim, ....) + rel_pos_type='bias_tf', + ), + ) -def cfg_window_size(cfg: MaxxVitTransformerCfg, img_size: Tuple[int, int]): - if cfg.window_size is not None: - assert cfg.grid_size - return cfg - partition_size = img_size[0] // cfg.partition_ratio, img_size[1] // cfg.partition_ratio - cfg = replace(cfg, window_size=partition_size, grid_size=partition_size) - return cfg +model_cfgs = dict( + # Fiddling with configs / defaults / still pretraining + coatnet_pico_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 3, 5, 2), + stem_width=(32, 64), + **_rw_max_cfg( # using newer max defaults here + conv_output_bias=True, + conv_attn_ratio=0.25, + ), + ), + coatnet_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + **_rw_max_cfg( # using newer max defaults here + stride_mode='pool', + conv_output_bias=True, + conv_attn_ratio=0.25, + ), + ), + coatnet_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + conv_attn_early=True, + transformer_shortcut_bias=False, + ), + ), + coatnet_1_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_early=True, + transformer_shortcut_bias=False, + ) + ), + coatnet_2_rw_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=(64, 128), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + #init_values=1e-6, + ), + ), + coatnet_3_rw_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=(96, 192), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + ), + ), + # Highly experimental 
configs + coatnet_bn_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_early=True, + transformer_shortcut_bias=False, + transformer_norm_layer='batchnorm2d', + ) + ), + coatnet_rmlp_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + **_rw_max_cfg( + conv_output_bias=True, + conv_attn_ratio=0.25, + rel_pos_type='mlp', + rel_pos_dim=384, + ), + ), + coatnet_rmlp_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + rel_pos_type='mlp', + ), + ), + coatnet_rmlp_1_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=(32, 64), + **_rw_coat_cfg( + pool_type='max', + conv_attn_early=True, + transformer_shortcut_bias=False, + rel_pos_type='mlp', + rel_pos_dim=384, # was supposed to be 512, woops + ), + ), + coatnet_rmlp_1_rw2_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + rel_pos_type='mlp', + rel_pos_dim=512, # was supposed to be 512, woops + ), + ), + coatnet_rmlp_2_rw_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=(64, 128), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + rel_pos_type='mlp' + ), + ), + coatnet_rmlp_3_rw_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=(96, 192), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + rel_pos_type='mlp' + ), + ), -class MaxxVit(nn.Module): - """ CoaTNet + MaxVit base model. + coatnet_nano_cc_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + block_type=('C', 'C', ('C', 'T'), ('C', 'T')), + **_rw_coat_cfg(), + ), + coatnext_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + weight_init='normal', + **_next_cfg( + rel_pos_type='bias', + init_values=(1e-5, None) + ), + ), - Highly configurable for different block compositions, tensor layouts, pooling types. - """ + # Trying to be like the CoAtNet paper configs + coatnet_0_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 5, 2), + stem_width=64, + ), + coatnet_1_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=64, + ), + coatnet_2_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=128, + ), + coatnet_3_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=192, + ), + coatnet_4_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 12, 28, 2), + stem_width=192, + ), + coatnet_5_224=MaxxVitCfg( + embed_dim=(256, 512, 1280, 2048), + depths=(2, 12, 28, 2), + stem_width=192, + ), - def __init__( - self, - cfg: MaxxVitCfg, - img_size: Union[int, Tuple[int, int]] = 224, - in_chans: int = 3, - num_classes: int = 1000, - global_pool: str = 'avg', - drop_rate: float = 0., - drop_path_rate: float = 0. 
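For reference, a minimal usage sketch of the reworked MaxxVit models registered in this file (not part of the patch itself; it assumes timm's create_model entrypoint and the default partition_ratio of 32 from the transformer cfg dataclass earlier in the file):

import timm
import torch

# 'maxvit_rmlp_nano_rw_256' is one of the variants defined in model_cfgs / default_cfgs
# in this file; its pretrained cfg uses input_size=(3, 256, 256) with fixed_input_size=True,
# so the input resolution should match the cfg.
model = timm.create_model('maxvit_rmlp_nano_rw_256', pretrained=False)
model.eval()

x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    # window/grid attention partitions resolve to 256 // 32 = 8x8 here
    # (assuming the default partition_ratio of 32 in cfg_window_size)
    logits = model(x)
print(logits.shape)  # torch.Size([1, 1000])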
- ): - super().__init__() - img_size = to_2tuple(img_size) - transformer_cfg = cfg_window_size(cfg.transformer_cfg, img_size) - self.num_classes = num_classes - self.global_pool = global_pool - self.num_features = cfg.embed_dim[-1] - self.embed_dim = cfg.embed_dim - self.drop_rate = drop_rate - self.grad_checkpointing = False + # Experimental MaxVit configs + maxvit_pico_rw_256=MaxxVitCfg( + embed_dim=(32, 64, 128, 256), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(24, 32), + **_rw_max_cfg(), + ), + maxvit_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + maxvit_tiny_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + maxvit_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), - self.stem = Stem( - in_chs=in_chans, - out_chs=cfg.stem_width, - act_layer=cfg.conv_cfg.act_layer, - norm_layer=cfg.conv_cfg.norm_layer, - norm_eps=cfg.conv_cfg.norm_eps, - ) + maxvit_rmlp_pico_rw_256=MaxxVitCfg( + embed_dim=(32, 64, 128, 256), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(24, 32), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_small_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg( + rel_pos_type='mlp', + init_values=1e-6, + ), + ), + maxvit_rmlp_small_rw_256=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg( + rel_pos_type='mlp', + init_values=1e-6, + ), + ), - stride = self.stem.stride - feat_size = tuple([i // s for i, s in zip(img_size, to_2tuple(stride))]) + maxvit_tiny_pm_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('PM',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + + maxxvit_rmlp_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + weight_init='normal', + **_next_cfg(), + ), + maxxvit_rmlp_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_next_cfg(), + ), + maxxvit_rmlp_small_rw_256=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(48, 96), + **_next_cfg(), + ), + maxxvit_rmlp_base_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=(48, 96), + **_next_cfg(), + ), + maxxvit_rmlp_large_rw_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 12, 2), + block_type=('M',) * 4, + stem_width=(64, 128), + **_next_cfg(), + ), - num_stages = len(cfg.embed_dim) - assert len(cfg.depths) == num_stages - dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)] - in_chs = self.stem.out_chs - stages = [] - for i in range(num_stages): - stage_stride = 2 - out_chs = 
cfg.embed_dim[i] - feat_size = tuple([(r - 1) // stage_stride + 1 for r in feat_size]) - stages += [MaxxVitStage( - in_chs, - out_chs, - depth=cfg.depths[i], - block_types=cfg.block_type[i], - conv_cfg=cfg.conv_cfg, - transformer_cfg=transformer_cfg, - feat_size=feat_size, - drop_path=dpr[i], - )] - stride *= stage_stride - in_chs = out_chs - self.stages = nn.Sequential(*stages) + # Trying to be like the MaxViT paper configs + maxvit_tiny_tf=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=64, + stem_bias=True, + head_hidden_size=512, + **_tf_cfg(), + ), + maxvit_small_tf=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=64, + stem_bias=True, + head_hidden_size=768, + **_tf_cfg(), + ), + maxvit_base_tf=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=64, + stem_bias=True, + head_hidden_size=768, + **_tf_cfg(), + ), + maxvit_large_tf=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=128, + stem_bias=True, + head_hidden_size=1024, + **_tf_cfg(), + ), + maxvit_xlarge_tf=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=192, + stem_bias=True, + head_hidden_size=1536, + **_tf_cfg(), + ), +) - final_norm_layer = get_norm_layer(cfg.transformer_cfg.norm_layer) - self.norm = final_norm_layer(self.num_features, eps=cfg.transformer_cfg.norm_eps) - # Classifier head - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) +def _create_maxxvit(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + MaxxVit, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) - # Weight init (default PyTorch init works well for AdamW if scheme not set) - assert cfg.weight_init in ('', 'normal', 'trunc_normal', 'xavier_normal', 'vit_eff') - if cfg.weight_init: - named_apply(partial(self._init_weights, scheme=cfg.weight_init), self) - def _init_weights(self, module, name, scheme=''): - if hasattr(module, 'init_weights'): - try: - module.init_weights(scheme=scheme) - except TypeError: - module.init_weights() +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.95, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + 'first_conv': 'stem.conv1', 'classifier': 'head.fc', + 'fixed_input_size': True, + **kwargs + } - @torch.jit.ignore - def no_weight_decay(self): - return { - k for k, _ in self.named_parameters() - if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])} - @torch.jit.ignore - def group_matcher(self, coarse=False): - matcher = dict( - stem=r'^stem', # stem and embed - blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))] - ) - return matcher +default_cfgs = generate_defaults({ + # Fiddling with configs / defaults / still pretraining + 'coatnet_pico_rw_224': _cfg(url=''), + 'coatnet_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_nano_rw_224_sw-f53093b4.pth', + crop_pct=0.9), + 'coatnet_0_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_0_rw_224_sw-a6439706.pth'), + 
'coatnet_1_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth' + ), + 'coatnet_2_rw_224': _cfg(url=''), + 'coatnet_3_rw_224': _cfg(url=''), - @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): - for s in self.stages: - s.grad_checkpointing = enable + # Highly experimental configs + 'coatnet_bn_0_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_bn_0_rw_224_sw-c228e218.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, + crop_pct=0.95), + 'coatnet_rmlp_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_nano_rw_224_sw-bd1d51b3.pth', + crop_pct=0.9), + 'coatnet_rmlp_0_rw_224': _cfg(url=''), + 'coatnet_rmlp_1_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'), + 'coatnet_rmlp_1_rw2_224': _cfg(url=''), + 'coatnet_rmlp_2_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_2_rw_224_sw-5ccfac55.pth'), + 'coatnet_rmlp_3_rw_224': _cfg(url=''), + 'coatnet_nano_cc_224': _cfg(url=''), + 'coatnext_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnext_nano_rw_224_ad-22cb71c2.pth', + crop_pct=0.9), - @torch.jit.ignore - def get_classifier(self): - return self.head.fc + # Trying to be like the CoAtNet paper configs + 'coatnet_0_224': _cfg(url=''), + 'coatnet_1_224': _cfg(url=''), + 'coatnet_2_224': _cfg(url=''), + 'coatnet_3_224': _cfg(url=''), + 'coatnet_4_224': _cfg(url=''), + 'coatnet_5_224': _cfg(url=''), - def reset_classifier(self, num_classes, global_pool=None): - self.num_classes = num_classes - if global_pool is None: - global_pool = self.head.global_pool.pool_type - self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + # Experimental configs + 'maxvit_pico_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_nano_rw_256_sw-fb127241.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_tiny_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_tiny_rw_224_sw-7d0dffeb.pth'), + 'maxvit_tiny_rw_256': _cfg( + url='', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_pico_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_pico_rw_256_sw-8d82f2c6.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_tiny_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_small_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_small_rw_224_sw-6ef0ae4f.pth', + crop_pct=0.9, + ), + 'maxvit_rmlp_small_rw_256': _cfg( + url='', + input_size=(3, 
256, 256), pool_size=(8, 8)), - def forward_features(self, x): - x = self.stem(x) - x = self.stages(x) - x = self.norm(x) - return x + 'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), - def forward_head(self, x, pre_logits: bool = False): - return self.head(x, pre_logits=pre_logits) + 'maxxvit_rmlp_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_nano_rw_256_sw-0325d459.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxxvit_rmlp_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxxvit_rmlp_small_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_small_rw_256_sw-37e217ff.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxxvit_rmlp_base_rw_224': _cfg(url=''), + 'maxxvit_rmlp_large_rw_224': _cfg(url=''), - def forward(self, x): - x = self.forward_features(x) - x = self.forward_head(x) - return x + # Trying to be like the MaxViT paper configs + 'maxvit_tiny_tf_224.in1k': _cfg( + url='', + #file='maxvit_tiny_tf_224_in1k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'maxvit_tiny_tf_384.in1k': _cfg( + url='', + #file='maxvit_tiny_tf_384_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_tiny_tf_512.in1k': _cfg( + url='', + #file='maxvit_tiny_tf_512_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), + 'maxvit_small_tf_224.in1k': _cfg( + url='', + #file='maxvit_small_tf_224_in1k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'maxvit_small_tf_384.in1k': _cfg( + url='', + #file='maxvit_small_tf_384_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_small_tf_512.in1k': _cfg( + url='', + #file='maxvit_small_tf_512_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), + 'maxvit_base_tf_224.in1k': _cfg( + url='', + #file='maxvit_base_tf_224_in1k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'maxvit_base_tf_384.in1k': _cfg( + url='', + #file='maxvit_base_tf_384_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_base_tf_512.in1k': _cfg( + url='', + #file='maxvit_base_tf_512_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), + 'maxvit_large_tf_224.in1k': _cfg( + url='', + #file='maxvit_large_tf_224_in1k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'maxvit_large_tf_384.in1k': _cfg( + url='', + #file='maxvit_large_tf_384_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_large_tf_512.in1k': _cfg( + url='', + #file='maxvit_large_tf_512_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), -def _create_maxxvit(variant, cfg_variant=None, pretrained=False, **kwargs): - return build_model_with_cfg( - MaxxVit, variant, pretrained, - model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], - feature_cfg=dict(flatten_sequential=True), - **kwargs) + 'maxvit_base_tf_224.in21k': _cfg( + url='', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'maxvit_base_tf_384.in21k_ft1k': _cfg( + url='', + #file='maxvit_base_tf_384_in21k_ft_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_base_tf_512.in21k_ft1k': _cfg( + url='', + #file='maxvit_base_tf_512_in21k_ft_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), + 
'maxvit_large_tf_224.in21k': _cfg( + url=''), + 'maxvit_large_tf_384.in21k_ft1k': _cfg( + url='', + #file='maxvit_large_tf_384_in21k_ft_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_large_tf_512.in21k_ft1k': _cfg( + url='', + #file='maxvit_large_tf_512_in21k_ft_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), + 'maxvit_xlarge_tf_224.in21k': _cfg( + url=''), + 'maxvit_xlarge_tf_384.in21k_ft1k': _cfg( + url='', + #file='maxvit_xlarge_tf_384_in21k_ft_in1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, crop_mode='squash'), + 'maxvit_xlarge_tf_512.in21k_ft1k': _cfg( + url='', + #file='maxvit_xlarge_tf_512_in21k_ft_in1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, crop_mode='squash'), +}) @register_model @@ -1773,6 +2075,11 @@ def coatnet_rmlp_1_rw_224(pretrained=False, **kwargs): return _create_maxxvit('coatnet_rmlp_1_rw_224', pretrained=pretrained, **kwargs) +@register_model +def coatnet_rmlp_1_rw2_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_1_rw2_224', pretrained=pretrained, **kwargs) + + @register_model def coatnet_rmlp_2_rw_224(pretrained=False, **kwargs): return _create_maxxvit('coatnet_rmlp_2_rw_224', pretrained=pretrained, **kwargs) @@ -1889,25 +2196,85 @@ def maxxvit_rmlp_small_rw_256(pretrained=False, **kwargs): @register_model -def maxvit_tiny_224(pretrained=False, **kwargs): - return _create_maxxvit('maxvit_tiny_224', pretrained=pretrained, **kwargs) +def maxxvit_rmlp_base_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('maxxvit_rmlp_base_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxxvit_rmlp_large_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('maxxvit_rmlp_large_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_tf_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_tf_224', 'maxvit_tiny_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_tf_384(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_tf_384', 'maxvit_tiny_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_tf_512(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_tf_512', 'maxvit_tiny_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_small_tf_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_small_tf_224', 'maxvit_small_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_small_tf_384(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_small_tf_384', 'maxvit_small_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_small_tf_512(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_small_tf_512', 'maxvit_small_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_base_tf_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_base_tf_224', 'maxvit_base_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_base_tf_384(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_base_tf_384', 'maxvit_base_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_base_tf_512(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_base_tf_512', 'maxvit_base_tf', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_large_tf_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_large_tf_224', 'maxvit_large_tf', pretrained=pretrained, **kwargs) + + +@register_model +def 
maxvit_large_tf_384(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_large_tf_384', 'maxvit_large_tf', pretrained=pretrained, **kwargs) @register_model -def maxvit_small_224(pretrained=False, **kwargs): - return _create_maxxvit('maxvit_small_224', pretrained=pretrained, **kwargs) +def maxvit_large_tf_512(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_large_tf_512', 'maxvit_large_tf', pretrained=pretrained, **kwargs) @register_model -def maxvit_base_224(pretrained=False, **kwargs): - return _create_maxxvit('maxvit_base_224', pretrained=pretrained, **kwargs) +def maxvit_xlarge_tf_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_xlarge_tf_224', 'maxvit_xlarge_tf', pretrained=pretrained, **kwargs) @register_model -def maxvit_large_224(pretrained=False, **kwargs): - return _create_maxxvit('maxvit_large_224', pretrained=pretrained, **kwargs) +def maxvit_xlarge_tf_384(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_xlarge_tf_384', 'maxvit_xlarge_tf', pretrained=pretrained, **kwargs) @register_model -def maxvit_xlarge_224(pretrained=False, **kwargs): - return _create_maxxvit('maxvit_xlarge_224', pretrained=pretrained, **kwargs) \ No newline at end of file +def maxvit_xlarge_tf_512(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_xlarge_tf_512', 'maxvit_xlarge_tf', pretrained=pretrained, **kwargs) \ No newline at end of file diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index f29216c9..cde0018b 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -32,7 +32,7 @@ import torch.utils.checkpoint from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\ OPENAI_CLIP_MEAN, OPENAI_CLIP_STD -from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply, adapt_input_conv, checkpoint_seq +from .helpers import build_model_with_cfg, named_apply, adapt_input_conv, checkpoint_seq from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ from ._pretrained import generate_defaults from .registry import register_model @@ -795,13 +795,15 @@ default_cfgs = generate_defaults({ mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0), 'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg( hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in1k', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, input_size=(3, 336, 336)), + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'), 'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg( hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in1k', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), 'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg( hf_hub_id='', - mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 336, 336)), + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, + crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'), 'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg( hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k_in1k', @@ -823,13 +825,15 @@ default_cfgs = generate_defaults({ mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0), 'vit_large_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg( hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in12k_in1k', - mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, input_size=(3, 336, 336)), + 
mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'), 'vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k': _cfg( hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), 'vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg( hf_hub_id='timm/vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k', - mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 336, 336)), + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, + crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'), 'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg( hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k', @@ -879,12 +883,16 @@ default_cfgs = generate_defaults({ 'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg( hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), + 'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg( + hf_hub_id='timm/vit_large_patch14_clip_336.openai_ft_in12k_in1k', + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, + crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'), 'vit_base_patch32_clip_224.openai_ft_in12k': _cfg( - #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k', + hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), 'vit_base_patch16_clip_224.openai_ft_in12k': _cfg( - #hf_hub_id='timm/vit_base_patch16_clip_224.openai_ft_in12k', + hf_hub_id='timm/vit_base_patch16_clip_224.openai_ft_in12k', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), 'vit_large_patch14_clip_224.openai_ft_in12k': _cfg( hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k',
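Several of the new pretrained cfgs above (the 384/512 MaxViT TF ports and the 336px CLIP ViT fine-tunes) set crop_pct=1.0 together with crop_mode='squash'. A rough sketch of how that differs from the default resize-shorter-side + center-crop eval preprocessing, written against PIL rather than timm's actual transform code, so the helper names and details are illustrative assumptions only:

from PIL import Image

def squash_resize(img: Image.Image, size: int = 384) -> Image.Image:
    # 'squash'-style eval preprocessing with crop_pct=1.0: resize both dimensions
    # straight to the target resolution; nothing is cropped away, but the aspect
    # ratio is not preserved.
    return img.resize((size, size), Image.BICUBIC)

def center_crop_resize(img: Image.Image, size: int = 384, crop_pct: float = 0.95) -> Image.Image:
    # Default-style eval preprocessing: scale the shorter side to size / crop_pct,
    # then take a centered size x size crop, preserving aspect ratio.
    scale = int(round(size / crop_pct))
    w, h = img.size
    if w <= h:
        new_w, new_h = scale, int(round(h * scale / w))
    else:
        new_w, new_h = int(round(w * scale / h)), scale
    img = img.resize((new_w, new_h), Image.BICUBIC)
    left, top = (new_w - size) // 2, (new_h - size) // 2
    return img.crop((left, top, left + size, top + size))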