diff --git a/README.md b/README.md index 572109de..c4f3a588 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,11 @@ ## What's New +### Feb 12, 2021 +* Update Normalization-Free nets to include new NFNet-F (https://arxiv.org/abs/2102.06171) model defs + ### Feb 10, 2021 -* First Normalizer-Free model training experiments done, +* First Normalization-Free model training experiments done, * nf_resnet50 - 80.68 top-1 @ 288x288, 80.31 @ 256x256 * nf_regnet_b1 - 79.30 @ 288x288, 78.75 @ 256x256 * More model archs, incl a flexible ByobNet backbone ('Bring-your-own-blocks') @@ -164,6 +167,7 @@ A full version of the list below with source links can be found in the [document * Inception-ResNet-V2 and Inception-V4 - https://arxiv.org/abs/1602.07261 * MobileNet-V3 (MBConvNet w/ Efficient Head) - https://arxiv.org/abs/1905.02244 * NASNet-A - https://arxiv.org/abs/1707.07012 +* NFNet-F - https://arxiv.org/abs/2102.06171 * NF-RegNet / NF-ResNet - https://arxiv.org/abs/2101.08692 * PNasNet - https://arxiv.org/abs/1712.00559 * RegNet - https://arxiv.org/abs/2003.13678 diff --git a/timm/models/nfnet.py b/timm/models/nfnet.py index 4dc848ba..1f83f6df 100644 --- a/timm/models/nfnet.py +++ b/timm/models/nfnet.py @@ -236,7 +236,7 @@ class DownsampleAvg(nn.Module): class NormFreeBlock(nn.Module): - """Normalization-free pre-activation block. + """Normalization-Free pre-activation block. """ def __init__( @@ -351,6 +351,7 @@ def create_stem(in_chs, out_chs, stem_type='', conv_layer=None, act_layer=None): return nn.Sequential(stem), stem_stride, stem_feature +# from https://github.com/deepmind/deepmind-research/tree/master/nfnets _nonlin_gamma = dict( identity=1.0, celu=1.270926833152771, @@ -371,10 +372,13 @@ _nonlin_gamma = dict( class NormFreeNet(nn.Module): - """ Normalization-free ResNets and RegNets + """ Normalization-Free Network - As described in `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + As described in : + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - https://arxiv.org/abs/2101.08692 + and + `High-Performance Large-Scale Image Recognition Without Normalization` - https://arxiv.org/abs/2102.06171 This model aims to cover both the NFRegNet-Bx models as detailed in the paper's code snippets and the (preact) ResNet models described earlier in the paper. @@ -432,7 +436,7 @@ class NormFreeNet(nn.Module): blocks += [NormFreeBlock( in_chs=prev_chs, out_chs=out_chs, alpha=cfg.alpha, - beta=1. / expected_var ** 0.5, # NOTE: beta used as multiplier in block + beta=1. / expected_var ** 0.5, stride=stride if block_idx == 0 else 1, dilation=dilation, first_dilation=first_dilation, @@ -477,8 +481,6 @@ class NormFreeNet(nn.Module): if m.bias is not None: nn.init.zeros_(m.bias) elif isinstance(m, nn.Conv2d): - # as per discussion with paper authors, original in haiku is - # hk.initializers.VarianceScaling(1.0, 'fan_in', 'normal')' w/ zero'd bias nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear') if m.bias is not None: nn.init.zeros_(m.bias)