From dff4717cbfceb35e00cff419edff16a5ba44474a Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Wed, 9 Nov 2022 13:35:45 -0800
Subject: [PATCH] Add clip b16 384x384 finetunes

---
 timm/models/vision_transformer.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index dc2ec97a..2066581a 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -785,7 +785,7 @@ default_cfgs = generate_defaults({
         hf_hub_id='timm/vit_base_patch16_clip_224.laion2b_ft_in1k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
     'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
-        #hf_hub_id='timm/vit_base_patch16_clip_384.laion2b_ft_in1k',
+        hf_hub_id='timm/vit_base_patch16_clip_384.laion2b_ft_in1k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, input_size=(3, 384, 384)),
     'vit_base_patch32_clip_448.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/vit_base_patch32_clip_448.laion2b_ft_in1k',
@@ -860,6 +860,9 @@ default_cfgs = generate_defaults({
     'vit_base_patch16_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/vit_base_patch16_clip_224.openai_ft_in1k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch16_clip_384.openai_ft_in1k': _cfg(
+        hf_hub_id='timm/vit_base_patch16_clip_384.openai_ft_in1k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
     'vit_large_patch14_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/vit_large_patch14_clip_224.openai_ft_in1k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
@@ -1175,6 +1178,16 @@ def vit_base_patch16_clip_224(pretrained=False, **kwargs):
     return model
 
 
+@register_model
+def vit_base_patch16_clip_384(pretrained=False, **kwargs):
+    """ ViT-B/16 CLIP image tower @ 384x384
+    """
+    model_kwargs = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
+    model = _create_vision_transformer('vit_base_patch16_clip_384', pretrained=pretrained, **model_kwargs)
+    return model
+
+
 @register_model
 def vit_large_patch14_clip_224(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/14) CLIP image tower
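
With the patch applied, the new 384x384 variant is registered like the other CLIP finetunes and should be reachable through timm's usual model factory, either by architecture name or by pretrained tag. A minimal usage sketch, assuming a timm build that includes this commit and access to the referenced Hugging Face Hub weights (exact output classes/shape follow the default ImageNet-1k head):

    import torch
    import timm

    # Build the ViT-B/16 CLIP image tower at 384x384 with the OpenAI-finetuned ImageNet-1k weights
    # added in this patch; the default_cfg supplies OPENAI_CLIP_MEAN/STD and the (3, 384, 384) input size.
    model = timm.create_model('vit_base_patch16_clip_384.openai_ft_in1k', pretrained=True)
    model.eval()

    x = torch.randn(1, 3, 384, 384)
    with torch.no_grad():
        logits = model(x)  # expected shape (1, 1000) for the ImageNet-1k classifier head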