diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py index 02c32cb7..f6a09ac2 100644 --- a/timm/models/vision_transformer.py +++ b/timm/models/vision_transformer.py @@ -541,7 +541,11 @@ def _create_vision_transformer(variant, pretrained=False, distilled=False, **kwa @register_model def vit_small_patch16_224(pretrained=False, **kwargs): - """ My custom 'small' ViT model. Depth=8, heads=8= mlp_ratio=3.""" + """ My custom 'small' ViT model. embed_dim=768, depth=8, num_heads=8, mlp_ratio=3. + NOTE: + * this differs from the DeiT based 'small' definitions with embed_dim=384, depth=12, num_heads=6 + * this model does not have a bias for QKV (unlike the official ViT and DeiT models) + """ model_kwargs = dict( patch_size=16, embed_dim=768, depth=8, num_heads=8, mlp_ratio=3., qkv_bias=False, norm_layer=nn.LayerNorm, **kwargs) @@ -994,4 +998,4 @@ def vit_deit_base_distilled_patch16_384(pretrained=False, **kwargs): model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) model = _create_vision_transformer( 'vit_deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs) - return model \ No newline at end of file + return model