diff --git a/timm/models/maxxvit.py b/timm/models/maxxvit.py
index f10e9f59..1090e755 100644
--- a/timm/models/maxxvit.py
+++ b/timm/models/maxxvit.py
@@ -82,6 +82,7 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth'
     ),
     'coatnet_2_rw_224': _cfg(url=''),
+    'coatnet_3_rw_224': _cfg(url=''),
 
     # Highly experimental configs
     'coatnet_bn_0_rw_224': _cfg(
@@ -94,6 +95,8 @@ default_cfgs = {
     'coatnet_rmlp_0_rw_224': _cfg(url=''),
     'coatnet_rmlp_1_rw_224': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'),
+    'coatnet_rmlp_2_rw_224': _cfg(url=''),
+    'coatnet_rmlp_3_rw_224': _cfg(url=''),
     'coatnet_nano_cc_224': _cfg(url=''),
     'coatnext_nano_rw_224': _cfg(url=''),
 
@@ -122,10 +125,19 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth',
         input_size=(3, 256, 256), pool_size=(8, 8)),
     'maxvit_rmlp_tiny_rw_256': _cfg(
-        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-2da819a5.pth',
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth',
         input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxvit_rmlp_small_rw_224': _cfg(
+        url=''),
+    'maxvit_rmlp_small_rw_256': _cfg(
+        url='',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
 
+    'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
 
+    'maxxvit_nano_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxxvit_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
+    'maxxvit_small_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)),
 
     # Trying to be like the MaxViT paper configs
     'maxvit_tiny_224': _cfg(url=''),
@@ -182,7 +194,7 @@ class MaxxVitConvCfg:
     attn_layer: str = 'se'
     attn_act_layer: str = 'silu'
     attn_ratio: float = 0.25
-    init_values: Optional[float] = 1e-5  # for ConvNeXt block
+    init_values: Optional[float] = 1e-6  # for ConvNeXt block, ignored by MBConv
     act_layer: str = 'gelu'
     norm_layer: str = ''
     norm_layer_cl: str = ''
@@ -218,10 +230,12 @@ def _rw_coat_cfg(
         pool_type='avg2',
         conv_output_bias=False,
         conv_attn_early=False,
+        conv_attn_act_layer='relu',
         conv_norm_layer='',
         transformer_shortcut_bias=True,
         transformer_norm_layer='layernorm2d',
         transformer_norm_layer_cl='layernorm',
+        init_values=None,
         rel_pos_type='bias',
         rel_pos_dim=512,
 ):
@@ -246,7 +260,7 @@ def _rw_coat_cfg(
             expand_output=False,
             output_bias=conv_output_bias,
             attn_early=conv_attn_early,
-            attn_act_layer='relu',
+            attn_act_layer=conv_attn_act_layer,
             act_layer='silu',
             norm_layer=conv_norm_layer,
         ),
@@ -254,6 +268,7 @@ def _rw_coat_cfg(
             expand_first=False,
             shortcut_bias=transformer_shortcut_bias,
             pool_type=pool_type,
+            init_values=init_values,
             norm_layer=transformer_norm_layer,
             norm_layer_cl=transformer_norm_layer_cl,
             rel_pos_type=rel_pos_type,
@@ -272,6 +287,7 @@ def _rw_max_cfg(
         transformer_norm_layer_cl='layernorm',
         window_size=None,
         dim_head=32,
+        init_values=None,
         rel_pos_type='bias',
         rel_pos_dim=512,
 ):
@@ -296,6 +312,7 @@ def _rw_max_cfg(
             pool_type=pool_type,
             dim_head=dim_head,
             window_size=window_size,
+            init_values=init_values,
             norm_layer=transformer_norm_layer,
             norm_layer_cl=transformer_norm_layer_cl,
             rel_pos_type=rel_pos_type,
@@ -312,7 +329,8 @@ def _next_cfg(
         transformer_norm_layer='layernorm2d',
         transformer_norm_layer_cl='layernorm',
         window_size=None,
-        rel_pos_type='bias',
+        init_values=1e-6,
+        rel_pos_type='mlp',  # MLP by default for maxxvit
         rel_pos_dim=512,
 ):
     # For experimental models with convnext instead of mbconv
@@ -322,6 +340,7 @@ def _next_cfg(
             stride_mode=stride_mode,
             pool_type=pool_type,
             expand_output=False,
+            init_values=init_values,
             norm_layer=conv_norm_layer,
             norm_layer_cl=conv_norm_layer_cl,
         ),
@@ -329,6 +348,7 @@ def _next_cfg(
             expand_first=False,
             pool_type=pool_type,
             window_size=window_size,
+            init_values=init_values,
             norm_layer=transformer_norm_layer,
             norm_layer_cl=transformer_norm_layer_cl,
             rel_pos_type=rel_pos_type,
@@ -381,7 +401,21 @@ model_cfgs = dict(
         embed_dim=(128, 256, 512, 1024),
         depths=(2, 6, 14, 2),
         stem_width=(64, 128),
-        **_rw_coat_cfg(stride_mode='dw'),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+        ),
+    ),
+    coatnet_3_rw_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        stem_width=(96, 192),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+        ),
     ),
 
     # Highly experimental configs
@@ -428,6 +462,29 @@ model_cfgs = dict(
             rel_pos_dim=384,  # was supposed to be 512, woops
         ),
     ),
+    coatnet_rmlp_2_rw_224=MaxxVitCfg(
+        embed_dim=(128, 256, 512, 1024),
+        depths=(2, 6, 14, 2),
+        stem_width=(64, 128),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+            rel_pos_type='mlp',
+        ),
+    ),
+    coatnet_rmlp_3_rw_224=MaxxVitCfg(
+        embed_dim=(192, 384, 768, 1536),
+        depths=(2, 6, 14, 2),
+        stem_width=(96, 192),
+        **_rw_coat_cfg(
+            stride_mode='dw',
+            conv_attn_act_layer='silu',
+            init_values=1e-6,
+            rel_pos_type='mlp',
+        ),
+    ),
+
     coatnet_nano_cc_224=MaxxVitCfg(
         embed_dim=(64, 128, 256, 512),
         depths=(3, 4, 6, 3),
@@ -504,6 +561,7 @@ model_cfgs = dict(
         stem_width=(32, 64),
         **_rw_max_cfg(),
     ),
+
     maxvit_rmlp_pico_rw_256=MaxxVitCfg(
         embed_dim=(32, 64, 128, 256),
         depths=(2, 2, 5, 2),
@@ -525,6 +583,27 @@ model_cfgs = dict(
         stem_width=(32, 64),
         **_rw_max_cfg(rel_pos_type='mlp'),
     ),
+    maxvit_rmlp_small_rw_224=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(
+            rel_pos_type='mlp',
+            init_values=1e-6,
+        ),
+    ),
+    maxvit_rmlp_small_rw_256=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_rw_max_cfg(
+            rel_pos_type='mlp',
+            init_values=1e-6,
+        ),
+    ),
+
     maxvit_tiny_pm_256=MaxxVitCfg(
         embed_dim=(64, 128, 256, 512),
         depths=(2, 2, 5, 2),
@@ -532,6 +611,7 @@ model_cfgs = dict(
         stem_width=(32, 64),
         **_rw_max_cfg(),
     ),
+
     maxxvit_nano_rw_256=MaxxVitCfg(
         embed_dim=(64, 128, 256, 512),
         depths=(1, 2, 3, 1),
@@ -540,6 +620,20 @@ model_cfgs = dict(
         weight_init='normal',
         **_next_cfg(),
     ),
+    maxxvit_tiny_rw_256=MaxxVitCfg(
+        embed_dim=(64, 128, 256, 512),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(32, 64),
+        **_next_cfg(),
+    ),
+    maxxvit_small_rw_256=MaxxVitCfg(
+        embed_dim=(96, 192, 384, 768),
+        depths=(2, 2, 5, 2),
+        block_type=('M',) * 4,
+        stem_width=(48, 96),
+        **_next_cfg(),
+    ),
 
     # Trying to be like the MaxViT paper configs
     maxvit_tiny_224=MaxxVitCfg(
@@ -1641,6 +1735,11 @@ def coatnet_2_rw_224(pretrained=False, **kwargs):
     return _create_maxxvit('coatnet_2_rw_224', pretrained=pretrained, **kwargs)
 
 
+@register_model
+def coatnet_3_rw_224(pretrained=False, **kwargs):
+    return _create_maxxvit('coatnet_3_rw_224', pretrained=pretrained, **kwargs)
+
+
 @register_model
 def coatnet_bn_0_rw_224(pretrained=False, **kwargs):
     return _create_maxxvit('coatnet_bn_0_rw_224', pretrained=pretrained, **kwargs)
@@ -1661,6 +1760,16 @@ def coatnet_rmlp_1_rw_224(pretrained=False, **kwargs):
     return _create_maxxvit('coatnet_rmlp_1_rw_224', pretrained=pretrained, **kwargs)
 
 
+@register_model
+def coatnet_rmlp_2_rw_224(pretrained=False, **kwargs):
+    return _create_maxxvit('coatnet_rmlp_2_rw_224', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def coatnet_rmlp_3_rw_224(pretrained=False, **kwargs):
+    return _create_maxxvit('coatnet_rmlp_3_rw_224', pretrained=pretrained, **kwargs)
+
+
 @register_model
 def coatnet_nano_cc_224(pretrained=False, **kwargs):
     return _create_maxxvit('coatnet_nano_cc_224', pretrained=pretrained, **kwargs)
@@ -1736,6 +1845,16 @@ def maxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs):
     return _create_maxxvit('maxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs)
 
 
+@register_model
+def maxvit_rmlp_small_rw_224(pretrained=False, **kwargs):
+    return _create_maxxvit('maxvit_rmlp_small_rw_224', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def maxvit_rmlp_small_rw_256(pretrained=False, **kwargs):
+    return _create_maxxvit('maxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs)
+
+
 @register_model
 def maxvit_tiny_pm_256(pretrained=False, **kwargs):
     return _create_maxxvit('maxvit_tiny_pm_256', pretrained=pretrained, **kwargs)
@@ -1746,6 +1865,16 @@ def maxxvit_nano_rw_256(pretrained=False, **kwargs):
     return _create_maxxvit('maxxvit_nano_rw_256', pretrained=pretrained, **kwargs)
 
 
+@register_model
+def maxxvit_tiny_rw_256(pretrained=False, **kwargs):
+    return _create_maxxvit('maxxvit_tiny_rw_256', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def maxxvit_small_rw_256(pretrained=False, **kwargs):
+    return _create_maxxvit('maxxvit_small_rw_256', pretrained=pretrained, **kwargs)
+
+
 @register_model
 def maxvit_tiny_224(pretrained=False, **kwargs):
     return _create_maxxvit('maxvit_tiny_224', pretrained=pretrained, **kwargs)
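
A quick smoke test for the newly registered variants, assuming the diff above has been applied to a timm checkout. Note that every new name is registered with `_cfg(url='')`, i.e. no pretrained weights have been published yet, so only random-init instantiation (`pretrained=False`) is expected to work:

```python
import torch
import timm

# Build one of the new registrations (random init; no released weights yet).
model = timm.create_model('coatnet_rmlp_2_rw_224', pretrained=False)
model.eval()

# *_224 configs default to 224x224 inputs; the *_256 configs declare
# input_size=(3, 256, 256) and would take 256x256 here instead.
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    out = model(x)
print(out.shape)  # torch.Size([1, 1000])
```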
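The `init_values` plumbing threaded through `_rw_coat_cfg`, `_rw_max_cfg`, and `_next_cfg` controls LayerScale-style scaling of the residual branches. As a rough sketch of the mechanism, assuming the usual CaiT-style formulation timm uses elsewhere (illustrative, not the exact class in maxxvit.py):

```python
import torch
import torch.nn as nn

class LayerScale(nn.Module):
    # Illustrative sketch: the residual branch output is multiplied by a
    # learnable per-channel gamma initialized to a small `init_values`
    # (e.g. 1e-6), so each block starts out close to identity. When a
    # config passes init_values=None, this scaling is omitted entirely.
    def __init__(self, dim: int, init_values: float = 1e-6):
        super().__init__()
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * self.gamma
```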
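`rel_pos_type='mlp'` (the "rmlp" in the model names, and now the `_next_cfg` default) replaces the learned relative position bias table of `rel_pos_type='bias'` with a small MLP that produces the per-head attention bias from relative window coordinates; `rel_pos_dim` sets its hidden width. A minimal sketch of that idea, with purely illustrative names and details that differ from timm's actual implementation:

```python
import torch
import torch.nn as nn

class RelPosMlpSketch(nn.Module):
    # Illustrative only: maps normalized relative (dy, dx) offsets within a
    # window to a per-head additive attention bias via a small MLP whose
    # hidden width plays the role of `rel_pos_dim`.
    def __init__(self, num_heads: int, window_size: int, hidden_dim: int = 512):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(2, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, num_heads),
        )
        coords = torch.stack(torch.meshgrid(
            torch.arange(window_size), torch.arange(window_size), indexing='ij'), dim=-1)
        coords = coords.reshape(-1, 2).float()  # (N, 2), N = window_size**2
        rel = coords[:, None, :] - coords[None, :, :]  # (N, N, 2) pairwise offsets
        self.register_buffer('rel_coords', rel / max(window_size - 1, 1))

    def forward(self) -> torch.Tensor:
        # (num_heads, N, N) bias added to the window attention logits
        return self.mlp(self.rel_coords).permute(2, 0, 1)
```

Because the bias is generated by a network rather than indexed from a fixed-size table, this style of relative position handling adapts more gracefully when models are fine-tuned at a different window or image size.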