MaxVit, ViT, ConvNeXt, and EfficientNet-v2 updates

* Add support for TF weights and modelling specifics to MaxVit (testing ported weights)
* More fine-tuned CLIP ViT configs
* ConvNeXt and MaxVit updated to new pretrained cfgs use
* EfficientNetV2, MaxVit and ConvNeXt high res models use squash crop/resize
pull/1520/head
Ross Wightman 2 years ago
parent d1e0a4607d
commit ec6921fcb0

@ -21,111 +21,13 @@ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import named_apply, build_model_with_cfg, checkpoint_seq from .helpers import named_apply, build_model_with_cfg, checkpoint_seq
from .layers import trunc_normal_, SelectAdaptivePool2d, DropPath, ConvMlp, Mlp, LayerNorm2d, LayerNorm, \ from .layers import trunc_normal_, SelectAdaptivePool2d, DropPath, ConvMlp, Mlp, LayerNorm2d, LayerNorm, \
create_conv2d, get_act_layer, make_divisible, to_ntuple create_conv2d, get_act_layer, make_divisible, to_ntuple
from ._pretrained import generate_defaults
from .registry import register_model from .registry import register_model
__all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
def _cfg(url='', **kwargs):
return {
'url': url,
'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
'crop_pct': 0.875, 'interpolation': 'bicubic',
'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
'first_conv': 'stem.0', 'classifier': 'head.fc',
**kwargs
}
default_cfgs = dict(
# timm specific variants
convnext_atto=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
convnext_atto_ols=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
convnext_femto=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
convnext_femto_ols=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
convnext_pico=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
convnext_pico_ols=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_nano=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_nano_ols=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_tiny_hnf=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_tiny=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_small=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_base=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_large=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_tiny_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_small_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_base_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_large_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_xlarge_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
convnext_tiny_384_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
convnext_small_384_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
convnext_base_384_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
convnext_large_384_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
convnext_xlarge_384_in22ft1k=_cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
convnext_tiny_in22k=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841),
convnext_small_in22k=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841),
convnext_base_in22k=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841),
convnext_large_in22k=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841),
convnext_xlarge_in22k=_cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841),
)
class ConvNeXtBlock(nn.Module): class ConvNeXtBlock(nn.Module):
""" ConvNeXt Block """ ConvNeXt Block
There are two equivalent implementations: There are two equivalent implementations:
@ -459,6 +361,107 @@ def _create_convnext(variant, pretrained=False, **kwargs):
return model return model
def _cfg(url='', **kwargs):
return {
'url': url,
'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
'crop_pct': 0.875, 'interpolation': 'bicubic',
'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
'first_conv': 'stem.0', 'classifier': 'head.fc',
**kwargs
}
default_cfgs = generate_defaults({
# timm specific variants
'convnext_atto.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
'convnext_atto_ols.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
'convnext_femto.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
'convnext_femto_ols.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
'convnext_pico.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth',
test_input_size=(3, 288, 288), test_crop_pct=0.95),
'convnext_pico_ols.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_nano.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_nano_ols.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_tiny_hnf.timm_in1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth',
crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_tiny.fb_in1k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_small.fb_in1k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_base.fb_in1k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_large.fb_in1k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_xlarge.untrained': _cfg(),
'convnext_tiny.fb_in22k_ft_in1k': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_small.fb_in22k_ft_in1k': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_base.fb_in22k_ft_in1k': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_large.fb_in22k_ft_in1k': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_xlarge.fb_in22k_ft_in1k': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth',
test_input_size=(3, 288, 288), test_crop_pct=1.0),
'convnext_tiny.fb_in22k_ft_in1k_384': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'convnext_small..fb_in22k_ft_in1k_384': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'convnext_base.fb_in22k_ft_in1k_384': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'convnext_large.fb_in22k_ft_in1k_384': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'convnext_xlarge.fb_in22k_ft_in1k_384': _cfg(
url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth',
input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'convnext_tiny_in22k.fb_in22k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841),
'convnext_small_in22k.fb_in22k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841),
'convnext_base_in22k.fb_in22k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841),
'convnext_large_in22k.fb_in22k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841),
'convnext_xlarge_in22k.fb_in22k': _cfg(
url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841),
})
@register_model @register_model
def convnext_atto(pretrained=False, **kwargs): def convnext_atto(pretrained=False, **kwargs):
# timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M
@ -569,105 +572,7 @@ def convnext_large(pretrained=False, **kwargs):
@register_model @register_model
def convnext_tiny_in22ft1k(pretrained=False, **kwargs): def convnext_xlarge(pretrained=False, **kwargs):
model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
model = _create_convnext('convnext_tiny_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_small_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
model = _create_convnext('convnext_small_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_base_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
model = _create_convnext('convnext_base_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_large_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
model = _create_convnext('convnext_large_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_xlarge_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
model = _create_convnext('convnext_xlarge_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_tiny_384_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
model = _create_convnext('convnext_tiny_384_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_small_384_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
model = _create_convnext('convnext_small_384_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_base_384_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
model = _create_convnext('convnext_base_384_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_large_384_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
model = _create_convnext('convnext_large_384_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_xlarge_384_in22ft1k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
model = _create_convnext('convnext_xlarge_384_in22ft1k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_tiny_in22k(pretrained=False, **kwargs):
model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs)
model = _create_convnext('convnext_tiny_in22k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_small_in22k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs)
model = _create_convnext('convnext_small_in22k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_base_in22k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs)
model = _create_convnext('convnext_base_in22k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_large_in22k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs)
model = _create_convnext('convnext_large_in22k', pretrained=pretrained, **model_args)
return model
@register_model
def convnext_xlarge_in22k(pretrained=False, **kwargs):
model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs)
model = _create_convnext('convnext_xlarge_in22k', pretrained=pretrained, **model_args) model = _create_convnext('convnext_xlarge', pretrained=pretrained, **model_args)
return model return model

@ -366,11 +366,11 @@ default_cfgs = {
'tf_efficientnetv2_m': _cfg( 'tf_efficientnetv2_m': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_l': _cfg( 'tf_efficientnetv2_l': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_s_in21ft1k': _cfg( 'tf_efficientnetv2_s_in21ft1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth',
@ -379,15 +379,15 @@ default_cfgs = {
'tf_efficientnetv2_m_in21ft1k': _cfg( 'tf_efficientnetv2_m_in21ft1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_l_in21ft1k': _cfg( 'tf_efficientnetv2_l_in21ft1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_xl_in21ft1k': _cfg( 'tf_efficientnetv2_xl_in21ft1k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_s_in21k': _cfg( 'tf_efficientnetv2_s_in21k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth',
@ -396,15 +396,15 @@ default_cfgs = {
'tf_efficientnetv2_m_in21k': _cfg( 'tf_efficientnetv2_m_in21k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_l_in21k': _cfg( 'tf_efficientnetv2_l_in21k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_xl_in21k': _cfg( 'tf_efficientnetv2_xl_in21k': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth',
mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843,
input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
'tf_efficientnetv2_b0': _cfg( 'tf_efficientnetv2_b0': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth', url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth',

@ -143,3 +143,17 @@ class GELU(nn.Module):
def forward(self, input: torch.Tensor) -> torch.Tensor: def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.gelu(input) return F.gelu(input)
def gelu_tanh(x: torch.Tensor, inplace: bool = False) -> torch.Tensor:
return F.gelu(x, approximate='tanh')
class GELUTanh(nn.Module):
"""Applies the Gaussian Error Linear Units function (w/ dummy inplace arg)
"""
def __init__(self, inplace: bool = False):
super(GELUTanh, self).__init__()
def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.gelu(input, approximate='tanh')

@ -28,6 +28,7 @@ _ACT_FN_DEFAULT = dict(
celu=F.celu, celu=F.celu,
selu=F.selu, selu=F.selu,
gelu=gelu, gelu=gelu,
gelu_tanh=gelu_tanh,
sigmoid=sigmoid, sigmoid=sigmoid,
tanh=tanh, tanh=tanh,
hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid, hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid,
@ -71,6 +72,7 @@ _ACT_LAYER_DEFAULT = dict(
celu=nn.CELU, celu=nn.CELU,
selu=nn.SELU, selu=nn.SELU,
gelu=GELU, gelu=GELU,
gelu_tanh=GELUTanh,
sigmoid=Sigmoid, sigmoid=Sigmoid,
tanh=Tanh, tanh=Tanh,
hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoid, hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoid,

File diff suppressed because it is too large Load Diff

@ -32,7 +32,7 @@ import torch.utils.checkpoint
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\
OPENAI_CLIP_MEAN, OPENAI_CLIP_STD OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply, adapt_input_conv, checkpoint_seq from .helpers import build_model_with_cfg, named_apply, adapt_input_conv, checkpoint_seq
from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_
from ._pretrained import generate_defaults from ._pretrained import generate_defaults
from .registry import register_model from .registry import register_model
@ -795,13 +795,15 @@ default_cfgs = generate_defaults({
mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0), mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg( 'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in1k', hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in1k',
mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, input_size=(3, 336, 336)), mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg( 'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in1k', hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in1k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg( 'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
hf_hub_id='', hf_hub_id='',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 336, 336)), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg( 'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k_in1k', hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k_in1k',
@ -823,13 +825,15 @@ default_cfgs = generate_defaults({
mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0), mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
'vit_large_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg( 'vit_large_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in12k_in1k', hf_hub_id='timm/vit_large_patch14_clip_336.laion2b_ft_in12k_in1k',
mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, input_size=(3, 336, 336)), mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
'vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k': _cfg( 'vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k', hf_hub_id='timm/vit_huge_patch14_clip_224.laion2b_ft_in12k_in1k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
'vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg( 'vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k', hf_hub_id='timm/vit_huge_patch14_clip_336.laion2b_ft_in12k_in1k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 336, 336)), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg( 'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k', hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
@ -879,12 +883,16 @@ default_cfgs = generate_defaults({
'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg( 'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k', hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k_in1k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
hf_hub_id='timm/vit_large_patch14_clip_336.openai_ft_in12k_in1k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
'vit_base_patch32_clip_224.openai_ft_in12k': _cfg( 'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
#hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k', hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
'vit_base_patch16_clip_224.openai_ft_in12k': _cfg( 'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
#hf_hub_id='timm/vit_base_patch16_clip_224.openai_ft_in12k', hf_hub_id='timm/vit_base_patch16_clip_224.openai_ft_in12k',
mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821), mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
'vit_large_patch14_clip_224.openai_ft_in12k': _cfg( 'vit_large_patch14_clip_224.openai_ft_in12k': _cfg(
hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k', hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in12k',

Loading…
Cancel
Save