From 5220711d8727d87ac21312443df64a99750fe046 Mon Sep 17 00:00:00 2001
From: Martins Bruveris
Date: Sun, 14 Nov 2021 11:01:48 +0000
Subject: [PATCH] Added B/8 models to ViT.

---
 timm/models/vision_transformer.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index 94ae2666..6e568abf 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -88,6 +88,9 @@ default_cfgs = {
         url='https://storage.googleapis.com/vit_models/augreg/'
             'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz',
         input_size=(3, 384, 384), crop_pct=1.0),
+    'vit_base_patch8_224': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/'
+            'B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'),
     'vit_large_patch32_224': _cfg(
         url='',  # no official model weights for this combo, only for in21k
     ),
@@ -118,6 +121,9 @@ default_cfgs = {
     'vit_base_patch16_224_in21k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
         num_classes=21843),
+    'vit_base_patch8_224_in21k': _cfg(
+        url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz',
+        num_classes=21843),
     'vit_large_patch32_224_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
         num_classes=21843),
@@ -640,6 +646,16 @@ def vit_base_patch16_384(pretrained=False, **kwargs):
     return model
 
 
+@register_model
+def vit_base_patch8_224(pretrained=False, **kwargs):
+    """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
+    """
+    model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **model_kwargs)
+    return model
+
+
 @register_model
 def vit_large_patch32_224(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
@@ -756,6 +772,18 @@ def vit_base_patch16_224_in21k(pretrained=False, **kwargs):
     return model
 
 
+@register_model
+def vit_base_patch8_224_in21k(pretrained=False, **kwargs):
+    """ ViT-Base model (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
+    ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer.
+    NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer
+    """
+    model_kwargs = dict(
+        patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+    model = _create_vision_transformer('vit_base_patch8_224_in21k', pretrained=pretrained, **model_kwargs)
+    return model
+
+
 @register_model
 def vit_large_patch32_224_in21k(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
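
Note: for reviewers, a minimal usage sketch of the two newly registered models. It assumes this patch is applied and the AugReg checkpoint URLs above are reachable; `timm.create_model` and the 224x224 input size are the standard timm API, the rest is illustrative, not part of the patch itself.

    import timm
    import torch

    # Instantiate the new ViT-B/8 model with ImageNet-1k fine-tuned weights.
    model = timm.create_model('vit_base_patch8_224', pretrained=True)
    model.eval()

    # A 224x224 input with patch size 8 yields 28x28 = 784 patch tokens plus the class token.
    x = torch.randn(1, 3, 224, 224)
    with torch.no_grad():
        logits = model(x)
    print(logits.shape)  # torch.Size([1, 1000])

    # The in21k variant keeps the 21843-class ImageNet-21k classifier head.
    model_21k = timm.create_model('vit_base_patch8_224_in21k', pretrained=True)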