From 8ece53e194d83e105603132e127262440cea3a03 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Thu, 22 Dec 2022 21:43:04 -0800 Subject: [PATCH] Switch BEiT to HF hub weights --- timm/models/beit.py | 36 ++++++++++++++++++++----------- timm/models/vision_transformer.py | 4 ++++ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/timm/models/beit.py b/timm/models/beit.py index de71f441..12ec493d 100644 --- a/timm/models/beit.py +++ b/timm/models/beit.py @@ -355,64 +355,76 @@ def _cfg(url='', **kwargs): default_cfgs = generate_default_cfgs({ 'beit_base_patch16_224.in22k_ft_in22k_in1k': _cfg( - url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'), + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth', + hf_hub_id='timm/'), 'beit_base_patch16_384.in22k_ft_in22k_in1k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth', + hf_hub_id='timm/', input_size=(3, 384, 384), crop_pct=1.0, ), 'beit_base_patch16_224.in22k_ft_in22k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth', + hf_hub_id='timm/', num_classes=21841, ), 'beit_large_patch16_224.in22k_ft_in22k_in1k': _cfg( - url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'), + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth', + hf_hub_id='timm/'), 'beit_large_patch16_384.in22k_ft_in22k_in1k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth', + hf_hub_id='timm/', input_size=(3, 384, 384), crop_pct=1.0, ), 'beit_large_patch16_512.in22k_ft_in22k_in1k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth', + hf_hub_id='timm/', input_size=(3, 512, 512), crop_pct=1.0, ), 'beit_large_patch16_224.in22k_ft_in22k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth', + hf_hub_id='timm/', num_classes=21841, ), 'beitv2_base_patch16_224.in1k_ft_in22k_in1k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth', + hf_hub_id='timm/', mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD ), 'beitv2_base_patch16_224.in1k_ft_in22k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth', - num_classes=21841, - mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + hf_hub_id='timm/', + num_classes=21841, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD ), 'beitv2_large_patch16_224.in1k_ft_in22k_in1k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth', - crop_pct=0.95, - mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + hf_hub_id='timm/', + crop_pct=0.95, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD ), 'beitv2_large_patch16_224.in1k_ft_in22k': _cfg( url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth', - num_classes=21841, - mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + hf_hub_id='timm/', + num_classes=21841, mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD ), 'eva_giant_patch14_224.clip_ft_in1k': _cfg( - hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz224_ftcls_89p1.pt', + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz224_ftcls_89p1.pt', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, ), 'eva_giant_patch14_336.clip_ft_in1k': _cfg( - hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz336_ftcls_89p4.pt', + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_clip_vis_enc_sz336_ftcls_89p4.pt', + hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'), 'eva_giant_patch14_336.m30m_ft_in22k_in1k': _cfg( - hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_336px_psz14_ema_89p6.pt', + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_336px_psz14_ema_89p6.pt', + hf_hub_id='timm/', mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'), 'eva_giant_patch14_560.m30m_ft_in22k_in1k': _cfg( - hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_560px_psz14_ema_89p7.pt', + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_21k_1k_560px_psz14_ema_89p7.pt', + hf_hub_id='timm/', mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, input_size=(3, 560, 560), crop_pct=1.0, crop_mode='squash'), }) diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 2b4be95c..d6865549 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -1052,18 +1052,22 @@ default_cfgs = generate_default_cfgs({ # EVA fine-tuned weights from MAE style MIM - EVA-CLIP target pretrain # https://github.com/baaivision/EVA/blob/7ecf2c0a370d97967e86d047d7af9188f78d2df3/eva/README.md#eva-l-learning-better-mim-representations-from-eva-clip 'eva_large_patch14_196.in22k_ft_in22k_in1k': _cfg( + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_196px_21k_to_1k_ft_88p6.pt', hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 196, 196), crop_pct=1.0), 'eva_large_patch14_336.in22k_ft_in22k_in1k': _cfg( + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_336px_21k_to_1k_ft_89p2.pt', hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'), 'eva_large_patch14_196.in22k_ft_in1k': _cfg( + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_196px_1k_ft_88p0.pt', hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 196, 196), crop_pct=1.0), 'eva_large_patch14_336.in22k_ft_in1k': _cfg( + # hf_hub_id='BAAI/EVA', hf_hub_filename='eva_l_psz14_336px_1k_ft_88p65.pt', hf_hub_id='timm/', mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_size=(3, 336, 336), crop_pct=1.0, crop_mode='squash'),