From 9c78de8c024bff0acc68b044dfb935366c6185dc Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Wed, 26 May 2021 15:28:42 -0700
Subject: [PATCH] Fix #661, move hardswish out of default args for LeViT.
 Enable native torch support for hardswish, hardsigmoid, mish if present.

---
 tests/test_layers.py                | 12 ++---
 tests/test_models.py                |  2 +-
 timm/models/efficientnet_blocks.py  |  6 +--
 timm/models/efficientnet_builder.py |  5 +-
 timm/models/ghostnet.py             |  4 +-
 timm/models/layers/create_act.py    | 74 ++++++++++++++++++-----------
 timm/models/layers/se.py            |  2 +-
 timm/models/levit.py                |  8 ++--
 8 files changed, 66 insertions(+), 47 deletions(-)

diff --git a/tests/test_layers.py b/tests/test_layers.py
index 714cb444..508a6aae 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -8,10 +8,10 @@ from timm.models.layers import create_act_layer, get_act_layer, set_layer_config
 
 
 class MLP(nn.Module):
-    def __init__(self, act_layer="relu"):
+    def __init__(self, act_layer="relu", inplace=True):
         super(MLP, self).__init__()
         self.fc1 = nn.Linear(1000, 100)
-        self.act = create_act_layer(act_layer, inplace=True)
+        self.act = create_act_layer(act_layer, inplace=inplace)
         self.fc2 = nn.Linear(100, 10)
 
     def forward(self, x):
@@ -21,14 +21,14 @@ class MLP(nn.Module):
         return x
 
 
-def _run_act_layer_grad(act_type):
+def _run_act_layer_grad(act_type, inplace=True):
     x = torch.rand(10, 1000) * 10
-    m = MLP(act_layer=act_type)
+    m = MLP(act_layer=act_type, inplace=inplace)
 
     def _run(x, act_layer=''):
         if act_layer:
             # replace act layer if set
-            m.act = create_act_layer(act_layer, inplace=True)
+            m.act = create_act_layer(act_layer, inplace=inplace)
         out = m(x)
         l = (out - 0).pow(2).sum()
         return l
@@ -58,7 +58,7 @@ def test_mish_grad():
 
 def test_hard_sigmoid_grad():
     for _ in range(100):
-        _run_act_layer_grad('hard_sigmoid')
+        _run_act_layer_grad('hard_sigmoid', inplace=None)
 
 
 def test_hard_swish_grad():
diff --git a/tests/test_models.py b/tests/test_models.py
index 44cb3ba2..18298dff 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -110,7 +110,7 @@ def test_model_backward(model_name, batch_size):
     assert not torch.isnan(outputs).any(), 'Output included NaNs'
 
 
-@pytest.mark.timeout(120)
+@pytest.mark.timeout(300)
 @pytest.mark.parametrize('model_name', list_models(exclude_filters=NON_STD_FILTERS))
 @pytest.mark.parametrize('batch_size', [1])
 def test_model_default_cfgs(model_name, batch_size):
diff --git a/timm/models/efficientnet_blocks.py b/timm/models/efficientnet_blocks.py
index 83b57beb..7853db0e 100644
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 
-from .layers import create_conv2d, drop_path, make_divisible
+from .layers import create_conv2d, drop_path, make_divisible, get_act_fn, create_act_layer
 from .layers.activations import sigmoid
 
 __all__ = [
@@ -36,9 +36,9 @@ class SqueezeExcite(nn.Module):
         reduced_chs = make_divisible(reduced_chs * se_ratio, divisor)
         act_layer = force_act_layer or act_layer
         self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
-        self.act1 = act_layer(inplace=True)
+        self.act1 = create_act_layer(act_layer, inplace=True)
         self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
-        self.gate_fn = gate_fn
+        self.gate_fn = get_act_fn(gate_fn)
 
     def forward(self, x):
         x_se = x.mean((2, 3), keepdim=True)
diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py
index 30739454..57e2039b 100644
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@@ -50,10 +50,7 @@ def resolve_bn_args(kwargs):
 
 
 def resolve_act_layer(kwargs, default='relu'):
-    act_layer = kwargs.pop('act_layer', default)
-    if isinstance(act_layer, str):
-        act_layer = get_act_layer(act_layer)
-    return act_layer
+    return get_act_layer(kwargs.pop('act_layer', default))
 
 
 def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None, round_limit=0.9):
diff --git a/timm/models/ghostnet.py b/timm/models/ghostnet.py
index c132142a..1783ff7a 100644
--- a/timm/models/ghostnet.py
+++ b/timm/models/ghostnet.py
@@ -13,7 +13,7 @@ import torch.nn.functional as F
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 
-from .layers import SelectAdaptivePool2d, Linear, hard_sigmoid, make_divisible
+from .layers import SelectAdaptivePool2d, Linear, make_divisible
 from .efficientnet_blocks import SqueezeExcite, ConvBnAct
 from .helpers import build_model_with_cfg
 from .registry import register_model
@@ -40,7 +40,7 @@ default_cfgs = {
 }
 
 
-_SE_LAYER = partial(SqueezeExcite, gate_fn=hard_sigmoid, divisor=4)
+_SE_LAYER = partial(SqueezeExcite, gate_fn='hard_sigmoid', divisor=4)
 
 
 class GhostModule(nn.Module):
diff --git a/timm/models/layers/create_act.py b/timm/models/layers/create_act.py
index 426c3681..aa557692 100644
--- a/timm/models/layers/create_act.py
+++ b/timm/models/layers/create_act.py
@@ -1,20 +1,26 @@
 """ Activation Factory
 Hacked together by / Copyright 2020 Ross Wightman
 """
+from typing import Union, Callable, Type
+
 from .activations import *
 from .activations_jit import *
 from .activations_me import *
 from .config import is_exportable, is_scriptable, is_no_jit
 
-# PyTorch has an optimized, native 'silu' (aka 'swish') operator as of PyTorch 1.7. This code
-# will use native version if present. Eventually, the custom Swish layers will be removed
-# and only native 'silu' will be used.
+# PyTorch has an optimized, native 'silu' (aka 'swish') operator as of PyTorch 1.7.
+# Also hardsigmoid, hardswish, and soon mish. This code will use native version if present.
+# Eventually, the custom SiLU, Mish, Hard* layers will be removed and only native variants will be used.
 _has_silu = 'silu' in dir(torch.nn.functional)
+_has_hardswish = 'hardswish' in dir(torch.nn.functional)
+_has_hardsigmoid = 'hardsigmoid' in dir(torch.nn.functional)
+_has_mish = 'mish' in dir(torch.nn.functional)
+
 
 _ACT_FN_DEFAULT = dict(
     silu=F.silu if _has_silu else swish,
     swish=F.silu if _has_silu else swish,
-    mish=mish,
+    mish=F.mish if _has_mish else mish,
     relu=F.relu,
     relu6=F.relu6,
     leaky_relu=F.leaky_relu,
@@ -24,33 +30,39 @@ _ACT_FN_DEFAULT = dict(
     gelu=gelu,
     sigmoid=sigmoid,
     tanh=tanh,
-    hard_sigmoid=hard_sigmoid,
-    hard_swish=hard_swish,
+    hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid,
+    hard_swish=F.hardswish if _has_hardswish else hard_swish,
     hard_mish=hard_mish,
 )
 
 _ACT_FN_JIT = dict(
     silu=F.silu if _has_silu else swish_jit,
     swish=F.silu if _has_silu else swish_jit,
-    mish=mish_jit,
-    hard_sigmoid=hard_sigmoid_jit,
-    hard_swish=hard_swish_jit,
+    mish=F.mish if _has_mish else mish_jit,
+    hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid_jit,
+    hard_swish=F.hardswish if _has_hardswish else hard_swish_jit,
     hard_mish=hard_mish_jit
 )
 
 _ACT_FN_ME = dict(
     silu=F.silu if _has_silu else swish_me,
     swish=F.silu if _has_silu else swish_me,
-    mish=mish_me,
-    hard_sigmoid=hard_sigmoid_me,
-    hard_swish=hard_swish_me,
+    mish=F.mish if _has_mish else mish_me,
+    hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid_me,
+    hard_swish=F.hardswish if _has_hardswish else hard_swish_me,
     hard_mish=hard_mish_me,
 )
 
 
+_ACT_FNS = (_ACT_FN_ME, _ACT_FN_JIT, _ACT_FN_DEFAULT)
+for a in _ACT_FNS:
+    a.setdefault('hardsigmoid', a.get('hard_sigmoid'))
+    a.setdefault('hardswish', a.get('hard_swish'))
+
+
 _ACT_LAYER_DEFAULT = dict(
     silu=nn.SiLU if _has_silu else Swish,
     swish=nn.SiLU if _has_silu else Swish,
-    mish=Mish,
+    mish=nn.Mish if _has_mish else Mish,
     relu=nn.ReLU,
     relu6=nn.ReLU6,
@@ -61,37 +73,44 @@ _ACT_LAYER_DEFAULT = dict(
     gelu=GELU,
     sigmoid=Sigmoid,
     tanh=Tanh,
-    hard_sigmoid=HardSigmoid,
-    hard_swish=HardSwish,
+    hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoid,
+    hard_swish=nn.Hardswish if _has_hardswish else HardSwish,
     hard_mish=HardMish,
 )
 
 _ACT_LAYER_JIT = dict(
     silu=nn.SiLU if _has_silu else SwishJit,
     swish=nn.SiLU if _has_silu else SwishJit,
-    mish=MishJit,
-    hard_sigmoid=HardSigmoidJit,
-    hard_swish=HardSwishJit,
+    mish=nn.Mish if _has_mish else MishJit,
+    hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoidJit,
+    hard_swish=nn.Hardswish if _has_hardswish else HardSwishJit,
    hard_mish=HardMishJit
 )
 
 _ACT_LAYER_ME = dict(
     silu=nn.SiLU if _has_silu else SwishMe,
     swish=nn.SiLU if _has_silu else SwishMe,
-    mish=MishMe,
-    hard_sigmoid=HardSigmoidMe,
-    hard_swish=HardSwishMe,
+    mish=nn.Mish if _has_mish else MishMe,
+    hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoidMe,
+    hard_swish=nn.Hardswish if _has_hardswish else HardSwishMe,
     hard_mish=HardMishMe,
 )
 
+_ACT_LAYERS = (_ACT_LAYER_ME, _ACT_LAYER_JIT, _ACT_LAYER_DEFAULT)
+for a in _ACT_LAYERS:
+    a.setdefault('hardsigmoid', a.get('hard_sigmoid'))
+    a.setdefault('hardswish', a.get('hard_swish'))
+
 
-def get_act_fn(name='relu'):
+def get_act_fn(name: Union[Callable, str] = 'relu'):
     """ Activation Function Factory
     Fetching activation fns by name with this function allows export or torch script friendly
     functions to be returned dynamically based on current config.
""" if not name: return None + if isinstance(name, Callable): + return name if not (is_no_jit() or is_exportable() or is_scriptable()): # If not exporting or scripting the model, first look for a memory-efficient version with # custom autograd, then fallback @@ -106,13 +125,15 @@ def get_act_fn(name='relu'): return _ACT_FN_DEFAULT[name] -def get_act_layer(name='relu'): +def get_act_layer(name: Union[Type[nn.Module], str] = 'relu'): """ Activation Layer Factory Fetching activation layers by name with this function allows export or torch script friendly functions to be returned dynamically based on current config. """ if not name: return None + if isinstance(name, type): + return name if not (is_no_jit() or is_exportable() or is_scriptable()): if name in _ACT_LAYER_ME: return _ACT_LAYER_ME[name] @@ -125,9 +146,8 @@ def get_act_layer(name='relu'): return _ACT_LAYER_DEFAULT[name] -def create_act_layer(name, inplace=False, **kwargs): +def create_act_layer(name: Union[nn.Module, str], inplace=None, **kwargs): act_layer = get_act_layer(name) - if act_layer is not None: - return act_layer(inplace=inplace, **kwargs) - else: + if act_layer is None: return None + return act_layer(**kwargs) if inplace is None else act_layer(inplace=inplace, **kwargs) diff --git a/timm/models/layers/se.py b/timm/models/layers/se.py index 54c0ef33..4354144d 100644 --- a/timm/models/layers/se.py +++ b/timm/models/layers/se.py @@ -42,7 +42,7 @@ class EffectiveSEModule(nn.Module): def __init__(self, channels, gate_layer='hard_sigmoid'): super(EffectiveSEModule, self).__init__() self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) - self.gate = create_act_layer(gate_layer, inplace=True) + self.gate = create_act_layer(gate_layer) def forward(self, x): x_se = x.mean((2, 3), keepdim=True) diff --git a/timm/models/levit.py b/timm/models/levit.py index 5019ee9a..2180254a 100644 --- a/timm/models/levit.py +++ b/timm/models/levit.py @@ -33,7 +33,7 @@ import torch.nn as nn from timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN from .helpers import build_model_with_cfg, overlay_external_default_cfg -from .layers import to_ntuple +from .layers import to_ntuple, get_act_layer from .vision_transformer import trunc_normal_ from .registry import register_model @@ -443,12 +443,14 @@ class Levit(nn.Module): mlp_ratio=2, hybrid_backbone=None, down_ops=None, - act_layer=nn.Hardswish, - attn_act_layer=nn.Hardswish, + act_layer='hard_swish', + attn_act_layer='hard_swish', distillation=True, use_conv=False, drop_path=0): super().__init__() + act_layer = get_act_layer(act_layer) + attn_act_layer = get_act_layer(attn_act_layer) if isinstance(img_size, tuple): # FIXME origin impl passes single img/res dim through whole hierarchy, # not sure this model will be used enough to spend time fixing it.