From d73f548b0f06f42441ae2ce85a649fd97f19f213 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Fri, 6 Jan 2023 14:35:26 -0800
Subject: [PATCH] Re-order vit pretrained entries for more sensible default weights (no .tag specified)

---
 timm/models/vision_transformer.py | 178 +++++++++++++++---------------
 1 file changed, 88 insertions(+), 90 deletions(-)

diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index 9441a3b2..8ffb1200 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -697,6 +697,13 @@ def _cfg(url='', **kwargs):
 
 
 default_cfgs = generate_default_cfgs({
+    # re-finetuned augreg 21k FT on in1k weights
+    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
+    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+
     # How to train your ViT (augreg) weights, pretrained on 21k FT on in1k
     'vit_tiny_patch16_224.augreg_in21k_ft_in1k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
@@ -751,13 +758,6 @@ default_cfgs = generate_default_cfgs({
         hf_hub_id='timm/',
         custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
 
-    # re-finetuned augreg 21k FT on in1k weights
-    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
-    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-
     # patch models (weights from official Google JAX impl) pretrained on in21k FT on in1k
     'vit_base_patch16_224.orig_in21k_ft_in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
@@ -802,7 +802,6 @@ default_cfgs = generate_default_cfgs({
     'vit_giant_patch14_224.untrained': _cfg(url=''),
     'vit_gigantic_patch14_224.untrained': _cfg(url=''),
 
-
     # patch models, imagenet21k (weights from official Google JAX impl)
     'vit_large_patch32_224.orig_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
@@ -869,7 +868,6 @@ default_cfgs = generate_default_cfgs({
         hf_hub_id='timm/',
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
 
-
     # ViT ImageNet-21K-P pretraining by MILL
     'vit_base_patch16_224_miil.in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
@@ -880,7 +878,7 @@ default_cfgs = generate_default_cfgs({
         hf_hub_id='timm/',
         mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
 
-    # custom timm variants
+    # Custom timm variants
     'vit_base_patch16_rpn_224.in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth',
         hf_hub_id='timm/'),
@@ -896,52 +894,6 @@ default_cfgs = generate_default_cfgs({
     'vit_base_patch16_gap_224': _cfg(),
 
     # CLIP pretrained image tower and related fine-tuned weights
-    'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.laion2b': _cfg(
-        #hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
-    'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
-    'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-    'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-
-    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-
     'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
@@ -973,28 +925,52 @@ default_cfgs = generate_default_cfgs({
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
+    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
-    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
+    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.openai': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch16_clip_224.openai': _cfg(
+    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_224.openai': _cfg(
+    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
     'vit_base_patch32_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/',
@@ -1010,30 +986,21 @@ default_cfgs = generate_default_cfgs({
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
 
-    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
-    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
+        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
+    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
     'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
@@ -1042,6 +1009,37 @@ default_cfgs = generate_default_cfgs({
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
+    'vit_base_patch32_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion2b': _cfg(
+        # hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_giant_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+
+    'vit_base_patch32_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_large_patch14_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     # experimental (may be removed)
     'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
     'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
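Context for the re-ordering above: when a model name is used without a `.tag` suffix, timm resolves it to a default tag taken from the order of entries in `default_cfgs`, so moving entries earlier changes which weights the bare name loads. A minimal sketch of that effect, assuming a timm release with tagged pretrained configs (0.8.x or later); the tag name is taken from the diff, and inspecting `pretrained_cfg` is just one illustrative way to see which tag was selected.

# Illustrative only; not part of the patch. pretrained=False avoids any
# weight download while still resolving the pretrained config for the name.
import timm

# Explicit tag: always selects exactly these weights (tag taken from the diff above).
tagged = timm.create_model('vit_base_patch16_224.augreg2_in21k_ft_in1k', pretrained=False)

# No tag: falls back to the architecture's default tag, which follows the
# entry order in default_cfgs -- the ordering this patch changes.
untagged = timm.create_model('vit_base_patch16_224', pretrained=False)
print(untagged.pretrained_cfg)  # shows which tagged weights the bare name maps to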