From e2a9408dd07142bf888bfee1d036dc9734b60fd0 Mon Sep 17 00:00:00 2001
From: Fredo Guan
Date: Fri, 10 Feb 2023 02:09:35 -0800
Subject: [PATCH] Squashed commit of the following:

commit b7696a30a772dbbb2e00d81e7096c24dac97df73
Author: Fredo Guan
Date:   Fri Feb 10 01:46:44 2023 -0800

    Update metaformers.py

commit 41fe5c36263b40a6cd7caddb85b10c5d82d48023
Author: Fredo Guan
Date:   Fri Feb 10 01:03:47 2023 -0800

    Update metaformers.py

commit a3aee37c35985c01ca07902d860f809b648c612c
Author: Fredo Guan
Date:   Fri Feb 10 00:32:04 2023 -0800

    Update metaformers.py

commit f938beb81b4f46851d6d6f04ae7a9a74871ee40d
Author: Fredo Guan
Date:   Fri Feb 10 00:24:58 2023 -0800

    Update metaformers.py

commit 10bde717e51c95cdf20135c8bba77a7a1b00d78c
Author: Fredo Guan
Date:   Sun Feb 5 02:11:28 2023 -0800

    Update metaformers.py

commit 39274bd45e78b8ead0509367f800121f0d7c25f4
Author: Fredo Guan
Date:   Sun Feb 5 02:06:58 2023 -0800

    Update metaformers.py

commit a2329ab8ec00d0ebc00979690293c3887cc44a4c
Author: Fredo Guan
Date:   Sun Feb 5 02:03:34 2023 -0800

    Update metaformers.py

commit 53b8ce5b8a6b6d828de61788bcc2e6043ebb3081
Author: Fredo Guan
Date:   Sun Feb 5 02:02:37 2023 -0800

    Update metaformers.py

commit ab6225b9414f534815958036f6d5a392038d7ab2
Author: Fredo Guan
Date:   Sun Feb 5 01:04:55 2023 -0800

    try NHWC

commit 02fcc30eaa67a3c92cae56f3062b2542c32c9283
Author: Fredo Guan
Date:   Sat Feb 4 23:47:06 2023 -0800

    Update metaformers.py

commit 366aae93047934bd3d7d37a077e713d424fa429c
Author: Fredo Guan
Date:   Sat Feb 4 23:37:30 2023 -0800

    Stem/Downsample rework

commit 26a8e481a5cb2a32004a796bc25c6800cd2fb7b7
Author: Fredo Guan
Date:   Wed Feb 1 07:42:07 2023 -0800

    Update metaformers.py

commit a913f5d4384aa4b2f62fdab46254ae3772df00ee
Author: Fredo Guan
Date:   Wed Feb 1 07:41:24 2023 -0800

    Update metaformers.py
---
 timm/models/metaformers.py | 204 ++++++++++++++++++++++++++-----------
 1 file changed, 143 insertions(+), 61 deletions(-)

diff --git a/timm/models/metaformers.py b/timm/models/metaformers.py
index f5e12c5a..36263e34 100644
--- a/timm/models/metaformers.py
+++ b/timm/models/metaformers.py
@@ -24,8 +24,11 @@ Adapted from https://github.com/sail-sg/metaformer, original copyright below
 
 from collections import OrderedDict
 from functools import partial
+
 import torch
 import torch.nn as nn
+from torch import Tensor
+
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from timm.layers import trunc_normal_, DropPath, SelectAdaptivePool2d, GroupNorm1
 from timm.layers.helpers import to_2tuple
@@ -40,28 +43,58 @@ from ._registry import register_model
 
 __all__ = ['MetaFormer']
 
+
+class Stem(nn.Module):
+    """
+    Stem implemented by a layer of convolution.
+    Conv2d params constant across all models.
+    """
+    def __init__(self,
+            in_channels,
+            out_channels,
+            norm_layer=None,
+    ):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size=7,
+            stride=4,
+            padding=2
+        )
+        self.norm = norm_layer(out_channels) if norm_layer else nn.Identity()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
+        # [B, C, H, W]
+        return x
+
 class Downsampling(nn.Module):
     """
     Downsampling implemented by a layer of convolution.
""" - def __init__(self, in_channels, out_channels, - kernel_size, stride=1, padding=0, - pre_norm=None, post_norm=None, pre_permute=False): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + norm_layer=None, + ): super().__init__() - self.pre_norm = pre_norm(in_channels) if pre_norm else nn.Identity() - self.pre_permute = pre_permute - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - stride=stride, padding=padding) - self.post_norm = post_norm(out_channels) if post_norm else nn.Identity() + self.norm = norm_layer(in_channels) if norm_layer else nn.Identity() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding + ) def forward(self, x): - if self.pre_permute: - # if take [B, H, W, C] as input, permute it to [B, C, H, W] - x = x.permute(0, 3, 1, 2) - x = self.pre_norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) - + x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) x = self.conv(x) - x = self.post_norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) return x @@ -299,7 +332,6 @@ class Mlp(nn.Module): return x - class MlpHead(nn.Module): """ MLP classification head """ @@ -323,7 +355,6 @@ class MlpHead(nn.Module): return x - class MetaFormerBlock(nn.Module): """ Implementation of one MetaFormer block. @@ -367,7 +398,6 @@ class MetaFormerBlock(nn.Module): if res_scale_init_value else nn.Identity() def forward(self, x): - x = x.permute(0, 2, 3, 1) x = self.res_scale1(x) + \ self.layer_scale1( self.drop_path1( @@ -380,6 +410,69 @@ class MetaFormerBlock(nn.Module): self.mlp(self.norm2(x)) ) ) + return x + +class MetaFormerStage(nn.Module): + # implementation of a single metaformer stage + def __init__( + self, + in_chs, + out_chs, + depth=2, + downsample_norm=partial(LayerNormGeneral, bias=False, eps=1e-6), + token_mixer=nn.Identity, + mlp=Mlp, + mlp_fn=nn.Linear, + mlp_act=StarReLU, + mlp_bias=False, + norm_layer=partial(LayerNormGeneral, eps=1e-6, bias=False), + dp_rates=[0.]*2, + layer_scale_init_value=None, + res_scale_init_value=None, + ): + super().__init__() + + self.grad_checkpointing = False + + # don't downsample if in_chs and out_chs are the same + self.downsample = nn.Identity() if in_chs == out_chs else Downsampling( + in_chs, + out_chs, + kernel_size=3, + stride=2, + padding=1, + norm_layer=downsample_norm + ) + + self.blocks = nn.Sequential(*[MetaFormerBlock( + dim=out_chs, + token_mixer=token_mixer, + mlp=mlp, + mlp_fn=mlp_fn, + mlp_act=mlp_act, + mlp_bias=mlp_bias, + norm_layer=norm_layer, + drop_path=dp_rates[i], + layer_scale_init_value=layer_scale_init_value, + res_scale_init_value=res_scale_init_value + ) for i in range(depth)]) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + # Permute to channels-first for feature extraction + def forward(self, x: Tensor): + + # [B, C, H, W] -> [B, H, W, C] + x = self.downsample(x).permute(0, 2, 3, 1) + + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + + # [B, H, W, C] -> [B, C, H, W] x = x.permute(0, 3, 1, 2) return x @@ -415,7 +508,7 @@ class MetaFormer(nn.Module): token_mixers=nn.Identity, mlps=Mlp, mlp_fn=nn.Linear, - mlp_act = StarReLU, + mlp_act=StarReLU, mlp_bias=False, norm_layers=partial(LayerNormGeneral, eps=1e-6, bias=False), drop_path_rate=0., @@ -433,24 +526,19 @@ class MetaFormer(nn.Module): self.head_fn = head_fn self.num_features = dims[-1] 
         self.drop_rate = drop_rate
+        self.num_stages = len(depths)
 
+        # convert everything to lists if they aren't indexable
         if not isinstance(depths, (list, tuple)):
             depths = [depths]  # it means the model has only one stage
         if not isinstance(dims, (list, tuple)):
             dims = [dims]
-
-        self.num_stages = len(depths)
-
         if not isinstance(token_mixers, (list, tuple)):
             token_mixers = [token_mixers] * self.num_stages
-
         if not isinstance(mlps, (list, tuple)):
             mlps = [mlps] * self.num_stages
-
         if not isinstance(norm_layers, (list, tuple)):
             norm_layers = [norm_layers] * self.num_stages
-
-
         if not isinstance(layer_scale_init_values, (list, tuple)):
             layer_scale_init_values = [layer_scale_init_values] * self.num_stages
         if not isinstance(res_scale_init_values, (list, tuple)):
@@ -459,47 +547,37 @@ class MetaFormer(nn.Module):
         self.grad_checkpointing = False
         self.feature_info = []
 
-        dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
-
+        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
 
-        self.stem = Downsampling(
+        self.stem = Stem(
             in_chans,
             dims[0],
-            kernel_size=7,
-            stride=4,
-            padding=2,
-            post_norm=downsample_norm
+            norm_layer=downsample_norm
         )
 
         stages = nn.ModuleList() # each stage consists of multiple metaformer blocks
         cur = 0
+        last_dim = dims[0]
         for i in range(self.num_stages):
-            stage = nn.Sequential(OrderedDict([
-                ('downsample', nn.Identity() if i == 0 else Downsampling(
-                    dims[i-1],
-                    dims[i],
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                    pre_norm=downsample_norm,
-                    pre_permute=False
-                )),
-                ('blocks', nn.Sequential(*[MetaFormerBlock(
-                    dim=dims[i],
-                    token_mixer=token_mixers[i],
-                    mlp=mlps[i],
-                    mlp_fn=mlp_fn,
-                    mlp_act=mlp_act,
-                    mlp_bias=mlp_bias,
-                    norm_layer=norm_layers[i],
-                    drop_path=dp_rates[cur + j],
-                    layer_scale_init_value=layer_scale_init_values[i],
-                    res_scale_init_value=res_scale_init_values[i]
-                ) for j in range(depths[i])])
-                )])
+            stage = MetaFormerStage(
+                last_dim,
+                dims[i],
+                depth=depths[i],
+                downsample_norm=downsample_norm,
+                token_mixer=token_mixers[i],
+                mlp=mlps[i],
+                mlp_fn=mlp_fn,
+                mlp_act=mlp_act,
+                mlp_bias=mlp_bias,
+                norm_layer=norm_layers[i],
+                dp_rates=dp_rates[i],
+                layer_scale_init_value=layer_scale_init_values[i],
+                res_scale_init_value=res_scale_init_values[i],
             )
+
             stages.append(stage)
             cur += depths[i]
+            last_dim = dims[i]
             self.feature_info += [dict(num_chs=dims[i], reduction=2, module=f'stages.{i}')]
 
         self.stages = nn.Sequential(*stages)
@@ -515,7 +593,7 @@ class MetaFormer(nn.Module):
             head = self.head_fn(dims[-1], num_classes)
         else:
             head = nn.Identity()
-        
+
         self.norm_pre = output_norm(self.num_features) if head_norm_first else nn.Identity()
         self.head = nn.Sequential(OrderedDict([
             ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
@@ -534,6 +612,8 @@ class MetaFormer(nn.Module):
     @torch.jit.ignore
     def set_grad_checkpointing(self, enable=True):
         self.grad_checkpointing = enable
+        for stage in self.stages:
+            stage.set_grad_checkpointing(enable=enable)
 
     @torch.jit.ignore
     def get_classifier(self):
@@ -552,23 +632,23 @@ class MetaFormer(nn.Module):
             head = nn.Identity()
         self.head.fc = head
 
-    def forward_head(self, x, pre_logits: bool = False):
+    def forward_head(self, x: Tensor, pre_logits: bool = False):
         # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :(
         x = self.head.global_pool(x)
         x = self.head.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
         x = self.head.flatten(x)
         return x if pre_logits else self.head.fc(x)
 
-    def forward_features(self, x):
+    def forward_features(self, x: Tensor):
         x = self.stem(x)
         if self.grad_checkpointing and not torch.jit.is_scripting():
             x = checkpoint_seq(self.stages, x)
         else:
             x = self.stages(x)
-        x = self.norm_pre(x)
+        x = self.norm_pre(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
         return x
 
-    def forward(self, x):
+    def forward(self, x: Tensor):
         x = self.forward_features(x)
         x = self.forward_head(x)
         return x
@@ -595,6 +675,8 @@ def checkpoint_filter_fn(state_dict, model):
         k = re.sub(r'([0-9]+).([0-9]+)', r'\1.blocks.\2', k)
         k = k.replace('stages.0.downsample', 'patch_embed')
         k = k.replace('patch_embed', 'stem')
+        k = k.replace('post_norm', 'norm')
+        k = k.replace('pre_norm', 'norm')
         k = re.sub(r'^head', 'head.fc', k)
         k = re.sub(r'^norm', 'head.norm', k)
         out_dict[k] = v
@@ -684,7 +766,7 @@ default_cfgs = generate_default_cfgs({
         classifier='head.fc.fc2'),
     'convformer_s18.sail_in1k_384': _cfg(
         url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_384.pth',
-        classifier='head.fc.fc2', input_size=(3, 384, 384)),
+        classifier='head.fc.fc2', input_size=(3, 384, 384), pool_size=(12,12)),
     'convformer_s18.sail_in22k_ft_in1k': _cfg(
         url='https://huggingface.co/sail/dl/resolve/main/convformer/convformer_s18_in21ft1k.pth',
         classifier='head.fc.fc2'),