|
|
@ -140,11 +140,25 @@ default_cfgs = {
|
|
|
|
num_classes=21843),
|
|
|
|
num_classes=21843),
|
|
|
|
|
|
|
|
|
|
|
|
# SAM trained models (https://arxiv.org/abs/2106.01548)
|
|
|
|
# SAM trained models (https://arxiv.org/abs/2106.01548)
|
|
|
|
'vit_base_patch32_sam_224': _cfg(
|
|
|
|
'vit_base_patch32_224_sam': _cfg(
|
|
|
|
url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz'),
|
|
|
|
url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz'),
|
|
|
|
'vit_base_patch16_sam_224': _cfg(
|
|
|
|
'vit_base_patch16_224_sam': _cfg(
|
|
|
|
url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz'),
|
|
|
|
url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz'),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# DINO pretrained - https://arxiv.org/abs/2104.14294 (no classifier head, for fine-tune only)
|
|
|
|
|
|
|
|
'vit_small_patch16_224_dino': _cfg(
|
|
|
|
|
|
|
|
url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth',
|
|
|
|
|
|
|
|
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
|
|
|
|
|
|
|
|
'vit_small_patch8_224_dino': _cfg(
|
|
|
|
|
|
|
|
url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth',
|
|
|
|
|
|
|
|
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
|
|
|
|
|
|
|
|
'vit_base_patch16_224_dino': _cfg(
|
|
|
|
|
|
|
|
url='https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth',
|
|
|
|
|
|
|
|
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
|
|
|
|
|
|
|
|
'vit_base_patch8_224_dino': _cfg(
|
|
|
|
|
|
|
|
url='https://dl.fbaipublicfiles.com/dino/dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth',
|
|
|
|
|
|
|
|
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
|
|
|
|
|
|
|
|
|
|
|
|
# deit models (FB weights)
|
|
|
|
# deit models (FB weights)
|
|
|
|
'deit_tiny_patch16_224': _cfg(
|
|
|
|
'deit_tiny_patch16_224': _cfg(
|
|
|
|
url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth',
|
|
|
|
url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth',
|
|
|
@ -699,26 +713,6 @@ def vit_large_patch16_384(pretrained=False, **kwargs):
|
|
|
|
return model
|
|
|
|
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch16_sam_224(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/16) with SAM pretrained weights.

    Paper: https://arxiv.org/abs/2106.01548

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch16_sam_224' variant.
    """
    # NOTE original SAM weights release worked with representation_size=768
    # representation_size is pinned to 0 here; a caller passing it again via kwargs makes dict() raise.
    arch_args = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        representation_size=0,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch16_sam_224', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch32_sam_224(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/32) with SAM pretrained weights.

    Paper: https://arxiv.org/abs/2106.01548

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch32_sam_224' variant.
    """
    # NOTE original SAM weights release worked with representation_size=768
    # representation_size is pinned to 0 here; a caller passing it again via kwargs makes dict() raise.
    arch_args = dict(
        patch_size=32,
        embed_dim=768,
        depth=12,
        num_heads=12,
        representation_size=0,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch32_sam_224', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
|
|
|
|
@register_model
|
|
|
|
def vit_huge_patch14_224(pretrained=False, **kwargs):
|
|
|
|
def vit_huge_patch14_224(pretrained=False, **kwargs):
|
|
|
|
""" ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
|
|
|
|
""" ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
|
|
|
@ -851,6 +845,62 @@ def vit_huge_patch14_224_in21k(pretrained=False, **kwargs):
|
|
|
|
return model
|
|
|
|
return model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch16_224_sam(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/16) with SAM pretrained weights.

    Paper: https://arxiv.org/abs/2106.01548

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch16_224_sam' variant.
    """
    # NOTE original SAM weights release worked with representation_size=768
    # No representation_size is set in this factory; one is only passed on if the caller supplies it.
    arch_args = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch16_224_sam', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch32_224_sam(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/32) with SAM pretrained weights.

    Paper: https://arxiv.org/abs/2106.01548

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch32_224_sam' variant.
    """
    # NOTE original SAM weights release worked with representation_size=768
    # No representation_size is set in this factory; one is only passed on if the caller supplies it.
    arch_args = dict(
        patch_size=32,
        embed_dim=768,
        depth=12,
        num_heads=12,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch32_224_sam', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_small_patch16_224_dino(pretrained=False, **kwargs):
    """ ViT-Small (ViT-S/16) with DINO pretrained weights (no head).

    Paper: https://arxiv.org/abs/2104.14294

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_small_patch16_224_dino' variant.
    """
    arch_args = dict(
        patch_size=16,
        embed_dim=384,
        depth=12,
        num_heads=6,
        **kwargs
    )
    return _create_vision_transformer('vit_small_patch16_224_dino', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_small_patch8_224_dino(pretrained=False, **kwargs):
    """ ViT-Small (ViT-S/8) with DINO pretrained weights (no head).

    Paper: https://arxiv.org/abs/2104.14294

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_small_patch8_224_dino' variant.
    """
    arch_args = dict(
        patch_size=8,
        embed_dim=384,
        depth=12,
        num_heads=6,
        **kwargs
    )
    return _create_vision_transformer('vit_small_patch8_224_dino', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch16_224_dino(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/16) with DINO pretrained weights (no head).

    Paper: https://arxiv.org/abs/2104.14294

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch16_224_dino' variant.
    """
    arch_args = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch16_224_dino', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
def vit_base_patch8_224_dino(pretrained=False, **kwargs):
    """ ViT-Base (ViT-B/8) with DINO pretrained weights (no head).

    Paper: https://arxiv.org/abs/2104.14294

    Args:
        pretrained: forwarded unchanged to `_create_vision_transformer`.
        **kwargs: extra keyword arguments merged with the fixed architecture settings below.

    Returns:
        The result of `_create_vision_transformer` for the 'vit_base_patch8_224_dino' variant.
    """
    arch_args = dict(
        patch_size=8,
        embed_dim=768,
        depth=12,
        num_heads=12,
        **kwargs
    )
    return _create_vision_transformer('vit_base_patch8_224_dino', pretrained=pretrained, **arch_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_model
|
|
|
|
@register_model
|
|
|
|
def deit_tiny_patch16_224(pretrained=False, **kwargs):
|
|
|
|
def deit_tiny_patch16_224(pretrained=False, **kwargs):
|
|
|
|
""" DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
|
|
|
|
""" DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877).
|
|
|
|