diff --git a/README.md b/README.md
index 572109de..c4f3a588 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,11 @@
 
 ## What's New
 
+### Feb 12, 2021
+* Update Normalization-Free nets to include new NFNet-F (https://arxiv.org/abs/2102.06171) model defs
+
 ### Feb 10, 2021
-* First Normalizer-Free model training experiments done,
+* First Normalization-Free model training experiments done,
   * nf_resnet50 - 80.68 top-1 @ 288x288, 80.31 @ 256x256
   * nf_regnet_b1 - 79.30 @ 288x288, 78.75 @ 256x256
 * More model archs, incl a flexible ByobNet backbone ('Bring-your-own-blocks')
@@ -164,6 +167,7 @@ A full version of the list below with source links can be found in the [document
 * Inception-ResNet-V2 and Inception-V4 - https://arxiv.org/abs/1602.07261
 * MobileNet-V3 (MBConvNet w/ Efficient Head) - https://arxiv.org/abs/1905.02244
 * NASNet-A - https://arxiv.org/abs/1707.07012
+* NFNet-F - https://arxiv.org/abs/2102.06171
 * NF-RegNet / NF-ResNet - https://arxiv.org/abs/2101.08692
 * PNasNet - https://arxiv.org/abs/1712.00559
 * RegNet - https://arxiv.org/abs/2003.13678
diff --git a/timm/models/nfnet.py b/timm/models/nfnet.py
index 4dc848ba..1f83f6df 100644
--- a/timm/models/nfnet.py
+++ b/timm/models/nfnet.py
@@ -236,7 +236,7 @@ class DownsampleAvg(nn.Module):
 
 
 class NormFreeBlock(nn.Module):
-    """Normalization-free pre-activation block.
+    """Normalization-Free pre-activation block.
     """
 
     def __init__(
@@ -351,6 +351,7 @@ def create_stem(in_chs, out_chs, stem_type='', conv_layer=None, act_layer=None):
     return nn.Sequential(stem), stem_stride, stem_feature
 
 
+# from https://github.com/deepmind/deepmind-research/tree/master/nfnets
 _nonlin_gamma = dict(
     identity=1.0,
     celu=1.270926833152771,
@@ -371,10 +372,13 @@ _nonlin_gamma = dict(
 
 
 class NormFreeNet(nn.Module):
-    """ Normalization-free ResNets and RegNets
+    """ Normalization-Free Network
 
-    As described in `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
+    As described in:
+    `Characterizing signal propagation to close the performance gap in unnormalized ResNets`
         - https://arxiv.org/abs/2101.08692
+    and
+    `High-Performance Large-Scale Image Recognition Without Normalization` - https://arxiv.org/abs/2102.06171
 
     This model aims to cover both the NFRegNet-Bx models as detailed in the paper's code snippets and
     the (preact) ResNet models described earlier in the paper.
@@ -432,7 +436,7 @@ class NormFreeNet(nn.Module):
                 blocks += [NormFreeBlock(
                     in_chs=prev_chs, out_chs=out_chs,
                     alpha=cfg.alpha,
-                    beta=1. / expected_var ** 0.5,  # NOTE: beta used as multiplier in block
+                    beta=1. / expected_var ** 0.5,
                     stride=stride if block_idx == 0 else 1,
                     dilation=dilation,
                     first_dilation=first_dilation,
@@ -477,8 +481,6 @@ class NormFreeNet(nn.Module):
                 if m.bias is not None:
                     nn.init.zeros_(m.bias)
             elif isinstance(m, nn.Conv2d):
-                # as per discussion with paper authors, original in haiku is
-                # hk.initializers.VarianceScaling(1.0, 'fan_in', 'normal')' w/ zero'd bias
                 nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear')
                 if m.bias is not None:
                     nn.init.zeros_(m.bias)