From 8bf63b6c6cc2b4ba69030cb043bf33cd562b399c Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Sun, 30 May 2021 12:47:02 -0700
Subject: [PATCH] Able to use other attn layer in EfficientNet now. Create test ECA + GC B0 configs. Make ECA more configurable.

---
 tests/test_models.py                |  2 +-
 timm/models/efficientnet.py         | 24 +++++++++++++++++++
 timm/models/efficientnet_builder.py |  4 ++--
 timm/models/layers/eca.py           | 36 +++++++++++++++++++++--------
 4 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 18298dff..1093e609 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -24,7 +24,7 @@ NUM_NON_STD = len(NON_STD_FILTERS)
 if 'GITHUB_ACTIONS' in os.environ:  # and 'Linux' in platform.system():
     # GitHub Linux runner is slower and hits memory limits sooner than MacOS, exclude bigger models
     EXCLUDE_FILTERS = [
-        '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm',
+        '*efficientnet_l2*', '*resnext101_32x48d', '*in21k', '*152x4_bitm', '*101x3_bitm', '*50x3_bitm',
         '*nfnet_f3*', '*nfnet_f4*', '*nfnet_f5*', '*nfnet_f6*', '*nfnet_f7*', '*resnetrs350*', '*resnetrs420*']
 else:
diff --git a/timm/models/efficientnet.py b/timm/models/efficientnet.py
index 8aa61ec5..09e47684 100644
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@@ -91,6 +91,12 @@ default_cfgs = {
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth',
         interpolation='bilinear'),

+    # NOTE experimenting with alternate attention
+    'eca_efficientnet_b0': _cfg(
+        url=''),
+    'gc_efficientnet_b0': _cfg(
+        url=''),
+
     'efficientnet_b0': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'),
     'efficientnet_b1': _cfg(
@@ -1223,6 +1229,24 @@ def efficientnet_b0(pretrained=False, **kwargs):
     return model


+@register_model
+def eca_efficientnet_b0(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ ECA attn """
+    # NOTE experimental config
+    model = _gen_efficientnet(
+        'eca_efficientnet_b0', se_layer='eca', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
+@register_model
+def gc_efficientnet_b0(pretrained=False, **kwargs):
+    """ EfficientNet-B0 w/ GlobalContext """
+    # NOTE experimental config
+    model = _gen_efficientnet(
+        'gc_efficientnet_b0', se_layer='gc', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
+    return model
+
+
 @register_model
 def efficientnet_b1(pretrained=False, **kwargs):
     """ EfficientNet-B1 """
diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py
index f44cf158..a23e8273 100644
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@@ -278,9 +278,9 @@ class EfficientNetBuilder:
         self.norm_layer = norm_layer
         self.se_layer = get_attn(se_layer)
         try:
-            self.se_layer(8, rd_ratio=1.0)
+            self.se_layer(8, rd_ratio=1.0)  # test if attn layer accepts rd_ratio arg
             self.se_has_ratio = True
-        except RuntimeError as e:
+        except TypeError:
             self.se_has_ratio = False
         self.drop_path_rate = drop_path_rate
         if feature_location == 'depthwise':
diff --git a/timm/models/layers/eca.py b/timm/models/layers/eca.py
index f2980730..5c024108 100644
--- a/timm/models/layers/eca.py
+++ b/timm/models/layers/eca.py
@@ -38,6 +38,9 @@ from torch import nn
 import torch.nn.functional as F


+from .create_act import create_act_layer
+
+
 class EcaModule(nn.Module):
     """Constructs an ECA module.
@@ -48,20 +51,27 @@ class EcaModule(nn.Module):
         refer to original paper https://arxiv.org/pdf/1910.03151.pdf
         (default=None. if channel size not given, use k_size given for kernel size.)
         kernel_size: Adaptive selection of kernel size (default=3)
+        gamma: used in kernel_size calc, see above
+        beta: used in kernel_size calc, see above
+        act_layer: optional non-linearity after conv, enables conv bias, this is an experiment
+        gate_layer: gating non-linearity to use
     """
-    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'):
         super(EcaModule, self).__init__()
-        assert kernel_size % 2 == 1
         if channels is not None:
             t = int(abs(math.log(channels, 2) + beta) / gamma)
             kernel_size = max(t if t % 2 else t + 1, 3)
-
-        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
+        assert kernel_size % 2 == 1
+        has_act = act_layer is not None
+        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=has_act)
+        self.act = create_act_layer(act_layer) if has_act else nn.Identity()
+        self.gate = create_act_layer(gate_layer)

     def forward(self, x):
         y = x.mean((2, 3)).view(x.shape[0], 1, -1)  # view for 1d conv
         y = self.conv(y)
-        y = y.view(x.shape[0], -1, 1, 1).sigmoid()
+        y = self.act(y)  # NOTE: usually a no-op, added for experimentation
+        y = self.gate(y).view(x.shape[0], -1, 1, 1)
         return x * y.expand_as(x)


@@ -86,27 +96,35 @@ class CecaModule(nn.Module):
         refer to original paper https://arxiv.org/pdf/1910.03151.pdf
         (default=None. if channel size not given, use k_size given for kernel size.)
         kernel_size: Adaptive selection of kernel size (default=3)
+        gamma: used in kernel_size calc, see above
+        beta: used in kernel_size calc, see above
+        act_layer: optional non-linearity after conv, enables conv bias, this is an experiment
+        gate_layer: gating non-linearity to use
     """
-    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'):
         super(CecaModule, self).__init__()
-        assert kernel_size % 2 == 1
         if channels is not None:
             t = int(abs(math.log(channels, 2) + beta) / gamma)
             kernel_size = max(t if t % 2 else t + 1, 3)
+        has_act = act_layer is not None
+        assert kernel_size % 2 == 1

         # PyTorch circular padding mode is buggy as of pytorch 1.4
         # see https://github.com/pytorch/pytorch/pull/17240
         # implement manual circular padding
-        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False)
         self.padding = (kernel_size - 1) // 2
+        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=has_act)
+        self.act = create_act_layer(act_layer) if has_act else nn.Identity()
+        self.gate = create_act_layer(gate_layer)

     def forward(self, x):
         y = x.mean((2, 3)).view(x.shape[0], 1, -1)
         # Manually implement circular padding, F.pad does not seem to be bugged
         y = F.pad(y, (self.padding, self.padding), mode='circular')
         y = self.conv(y)
-        y = y.view(x.shape[0], -1, 1, 1).sigmoid()
+        y = self.act(y)  # NOTE: usually a no-op, added for experimentation
+        y = self.gate(y).view(x.shape[0], -1, 1, 1)
         return x * y.expand_as(x)
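
Reviewer note (not part of the patch): a minimal sketch of the two behaviours the diff relies on, assuming the timm version this patch targets. timm.create_model and get_attn are the library's existing entry points; eca_kernel_size below is a hypothetical helper written only to illustrate the adaptive kernel-size rule in EcaModule/CecaModule.

import math

import timm
from timm.models.layers import get_attn


def eca_kernel_size(channels, gamma=2, beta=1):
    # Mirrors EcaModule/CecaModule: t = |log2(C) + beta| / gamma,
    # rounded up to the nearest odd value, with a floor of 3.
    t = int(abs(math.log(channels, 2) + beta) / gamma)
    return max(t if t % 2 else t + 1, 3)


assert eca_kernel_size(8) == 3     # narrow layers floor at k=3
assert eca_kernel_size(512) == 5   # wider layers get a larger 1d conv kernel

# The builder probe: instantiate the attn layer once with rd_ratio and fall
# back on TypeError (what Python raises for an unexpected keyword argument),
# which is why the except clause changed from RuntimeError to TypeError.
for name in ('eca', 'gc'):
    attn = get_attn(name)
    try:
        attn(8, rd_ratio=1.0)
        se_has_ratio = True
    except TypeError:
        se_has_ratio = False
    print(name, 'accepts rd_ratio:', se_has_ratio)

# The new experimental configs are registered and build as usual; no
# pretrained weights exist yet (url='' in default_cfgs).
model = timm.create_model('eca_efficientnet_b0', pretrained=False)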