@@ -24,8 +24,11 @@ Adapted from https://github.com/sail-sg/metaformer, original copyright below
 from collections import OrderedDict
 from functools import partial

 import torch
 import torch.nn as nn
+from torch import Tensor

 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from timm.layers import trunc_normal_, DropPath, SelectAdaptivePool2d, GroupNorm1
 from timm.layers.helpers import to_2tuple
@@ -40,28 +43,58 @@ from ._registry import register_model

 __all__ = ['MetaFormer']


+class Stem(nn.Module):
+    """
+    Stem implemented by a layer of convolution.
+    Conv2d params constant across all models.
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 norm_layer=None,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=4,
+            padding=2
+        )
+        self.norm = norm_layer(out_channels) if norm_layer else nn.Identity()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
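+        # NOTE: norm_layer is assumed to expect channels-last input, hence the
+        # NCHW -> NHWC -> NCHW round-trip around it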
+        # [B, C, H, W]
+        return x
+
+
 class Downsampling(nn.Module):
     """
     Downsampling implemented by a layer of convolution.
     """
-    def __init__(self, in_channels, out_channels,
-                 kernel_size, stride=1, padding=0,
-                 pre_norm=None, post_norm=None, pre_permute=False):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 norm_layer=None,
+    ):
         super().__init__()
-        self.pre_norm = pre_norm(in_channels) if pre_norm else nn.Identity()
-        self.pre_permute = pre_permute
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
-                              stride=stride, padding=padding)
-        self.post_norm = post_norm(out_channels) if post_norm else nn.Identity()
+        self.norm = norm_layer(in_channels) if norm_layer else nn.Identity()
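+        # a single input-side norm replaces the former pre_norm/post_norm pair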
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding
+        )

     def forward(self, x):
-        if self.pre_permute:
-            # if take [B, H, W, C] as input, permute it to [B, C, H, W]
-            x = x.permute(0, 3, 1, 2)
-        x = self.pre_norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
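+        # channels-first input is now assumed; the permute only serves the channels-last norm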
         x = self.conv(x)
-        x = self.post_norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
         return x

@@ -299,7 +332,6 @@ class Mlp(nn.Module):
         return x

-
 class MlpHead(nn.Module):
     """ MLP classification head
     """
@@ -323,7 +355,6 @@ class MlpHead(nn.Module):
         return x

-
 class MetaFormerBlock(nn.Module):
     """
     Implementation of one MetaFormer block.
@@ -367,7 +398,6 @@ class MetaFormerBlock(nn.Module):
             if res_scale_init_value else nn.Identity()
-
     def forward(self, x):
         x = x.permute(0, 2, 3, 1)
         x = self.res_scale1(x) + \
             self.layer_scale1(
                 self.drop_path1(
@@ -380,6 +410,69 @@ class MetaFormerBlock(nn.Module):
                     self.mlp(self.norm2(x))
                 )
             )
         return x


+class MetaFormerStage(nn.Module):
+    # implementation of a single metaformer stage
+    def __init__(
+            self,
+            in_chs,
+            out_chs,
+            depth=2,
+            downsample_norm=partial(LayerNormGeneral, bias=False, eps=1e-6),
+            token_mixer=nn.Identity,
+            mlp=Mlp,
+            mlp_fn=nn.Linear,
+            mlp_act=StarReLU,
+            mlp_bias=False,
+            norm_layer=partial(LayerNormGeneral, eps=1e-6, bias=False),
+            dp_rates=[0.]*2,
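+            # one stochastic depth rate per block in this stage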
+            layer_scale_init_value=None,
+            res_scale_init_value=None,
+    ):
+        super().__init__()
+
+        self.grad_checkpointing = False
+        # don't downsample if in_chs and out_chs are the same
+        self.downsample = nn.Identity() if in_chs == out_chs else Downsampling(
+            in_chs,
+            out_chs,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            norm_layer=downsample_norm
+        )
+
+        self.blocks = nn.Sequential(*[MetaFormerBlock(
+            dim=out_chs,
+            token_mixer=token_mixer,
+            mlp=mlp,
+            mlp_fn=mlp_fn,
+            mlp_act=mlp_act,
+            mlp_bias=mlp_bias,
+            norm_layer=norm_layer,
+            drop_path=dp_rates[i],
+            layer_scale_init_value=layer_scale_init_value,
+            res_scale_init_value=res_scale_init_value
+        ) for i in range(depth)])
+
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable=True):
+        self.grad_checkpointing = enable
+
+    def forward(self, x: Tensor):
+        # [B, C, H, W] -> [B, H, W, C]
+        x = self.downsample(x).permute(0, 2, 3, 1)
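+        # the block stack (token mixers, norms, MLPs) runs channels-last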
+
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+
+        # [B, H, W, C] -> [B, C, H, W], permute back to channels-first for feature extraction
+        x = x.permute(0, 3, 1, 2)
+        return x

@@ -415,7 +508,7 @@ class MetaFormer(nn.Module):
             token_mixers=nn.Identity,
             mlps=Mlp,
             mlp_fn=nn.Linear,
-            mlp_act = StarReLU,
+            mlp_act=StarReLU,
             mlp_bias=False,
             norm_layers=partial(LayerNormGeneral, eps=1e-6, bias=False),
             drop_path_rate=0.,
@@ -433,24 +526,19 @@ class MetaFormer(nn.Module):
         self.head_fn = head_fn
         self.num_features = dims[-1]
         self.drop_rate = drop_rate
-        self.num_stages = len(depths)

         # convert everything to lists if they aren't indexable
         if not isinstance(depths, (list, tuple)):
             depths = [depths]  # it means the model has only one stage
         if not isinstance(dims, (list, tuple)):
             dims = [dims]
+        self.num_stages = len(depths)

         if not isinstance(token_mixers, (list, tuple)):
             token_mixers = [token_mixers] * self.num_stages
         if not isinstance(mlps, (list, tuple)):
             mlps = [mlps] * self.num_stages
         if not isinstance(norm_layers, (list, tuple)):
             norm_layers = [norm_layers] * self.num_stages
         if not isinstance(layer_scale_init_values, (list, tuple)):
             layer_scale_init_values = [layer_scale_init_values] * self.num_stages
         if not isinstance(res_scale_init_values, (list, tuple)):
@@ -459,47 +547,37 @@ class MetaFormer(nn.Module):
         self.grad_checkpointing = False
         self.feature_info = []

-        dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
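+        # linspace yields one rate per block over the whole network; split(depths)
+        # regroups them into a per-stage list of per-block rates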

-        self.stem = Downsampling(
+        self.stem = Stem(
             in_chans,
             dims[0],
-            kernel_size=7,
-            stride=4,
-            padding=2,
-            post_norm=downsample_norm
+            norm_layer=downsample_norm
         )

         stages = nn.ModuleList()  # each stage consists of multiple metaformer blocks
-        cur = 0
+        last_dim = dims[0]
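+        # width of the previous stage; MetaFormerStage only downsamples when in_chs != out_chs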
         for i in range(self.num_stages):
-            stage = nn.Sequential(OrderedDict([
-                ('downsample', nn.Identity() if i == 0 else Downsampling(
-                    dims[i-1],
-                    dims[i],
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    pre_norm=downsample_norm,
-                    pre_permute=False
-                )),
-                ('blocks', nn.Sequential(*[MetaFormerBlock(
-                    dim=dims[i],
-                    token_mixer=token_mixers[i],
-                    mlp=mlps[i],
-                    mlp_fn=mlp_fn,
-                    mlp_act=mlp_act,
-                    mlp_bias=mlp_bias,
-                    norm_layer=norm_layers[i],
-                    drop_path=dp_rates[cur + j],
-                    layer_scale_init_value=layer_scale_init_values[i],
-                    res_scale_init_value=res_scale_init_values[i]
-                ) for j in range(depths[i])])
-            )]))
+            stage = MetaFormerStage(
+                last_dim,
+                dims[i],
+                depth=depths[i],
+                downsample_norm=downsample_norm,
+                token_mixer=token_mixers[i],
+                mlp=mlps[i],
+                mlp_fn=mlp_fn,
+                mlp_act=mlp_act,
+                mlp_bias=mlp_bias,
+                norm_layer=norm_layers[i],
+                dp_rates=dp_rates[i],
+                layer_scale_init_value=layer_scale_init_values[i],
+                res_scale_init_value=res_scale_init_values[i],
+            )
             stages.append(stage)
-            cur += depths[i]
+            last_dim = dims[i]
             self.feature_info += [dict(num_chs=dims[i], reduction=2, module=f'stages.{i}')]

         self.stages = nn.Sequential(*stages)
@@ -515,7 +593,7 @@ class MetaFormer(nn.Module):
             head = self.head_fn(dims[-1], num_classes)
         else:
             head = nn.Identity()

         self.norm_pre = output_norm(self.num_features) if head_norm_first else nn.Identity()
         self.head = nn.Sequential(OrderedDict([
             ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
@@ -534,6 +612,8 @@ class MetaFormer(nn.Module):
     @torch.jit.ignore
     def set_grad_checkpointing(self, enable=True):
         self.grad_checkpointing = enable
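+        # propagate the flag so each stage can checkpoint its own block sequence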
+        for stage in self.stages:
+            stage.set_grad_checkpointing(enable=enable)

     @torch.jit.ignore
     def get_classifier(self):
@@ -552,23 +632,23 @@ class MetaFormer(nn.Module):
             head = nn.Identity()
         self.head.fc = head

-    def forward_head(self, x, pre_logits: bool = False):
+    def forward_head(self, x: Tensor, pre_logits: bool = False):
         # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :(
         x = self.head.global_pool(x)
         x = self.head.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
         x = self.head.flatten(x)
         return x if pre_logits else self.head.fc(x)

-    def forward_features(self, x):
+    def forward_features(self, x: Tensor):
         x = self.stem(x)
         if self.grad_checkpointing and not torch.jit.is_scripting():
             x = checkpoint_seq(self.stages, x)
         else:
             x = self.stages(x)
-        x = self.norm_pre(x)
+        x = self.norm_pre(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
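+        # norm_pre, like the other norms, operates channels-last, hence the same round-trip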
         return x

-    def forward(self, x):
+    def forward(self, x: Tensor):
         x = self.forward_features(x)
         x = self.forward_head(x)
         return x
@@ -595,6 +675,8 @@ def checkpoint_filter_fn(state_dict, model):
         k = re.sub(r'([0-9]+).([0-9]+)', r'\1.blocks.\2', k)
         k = k.replace('stages.0.downsample', 'patch_embed')
+        k = k.replace('patch_embed', 'stem')
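+        # fold the old pre_norm/post_norm names into the single Downsampling norm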
+        k = k.replace('post_norm', 'norm')
+        k = k.replace('pre_norm', 'norm')
         k = re.sub(r'^head', 'head.fc', k)
         k = re.sub(r'^norm', 'head.norm', k)
         out_dict[k] = v
@@ -684,7 +766,7 @@ default_cfgs = generate_default_cfgs({
         classifier='head.fc.fc2'),
     'convformer_s18.sail_in1k_384': _cfg(
         url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_384.pth',
-        classifier='head.fc.fc2', input_size=(3, 384, 384)),
+        classifier='head.fc.fc2', input_size=(3, 384, 384), pool_size=(12, 12)),
     'convformer_s18.sail_in22k_ft_in1k': _cfg(
         url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_in21ft1k.pth',
         classifier='head.fc.fc2'),