diff --git a/timm/models/convnext.py b/timm/models/convnext.py
index 1aacef2b..4660d860 100644
--- a/timm/models/convnext.py
+++ b/timm/models/convnext.py
@@ -108,8 +108,17 @@ class LayerNorm2d(nn.LayerNorm):
 
     def forward(self, x) -> torch.Tensor:
         if _is_contiguous(x):
+            # still faster than going to the alternate implementation below
+            # call contiguous() at the end, because otherwise the rest of the model is computed in channels-last
             return F.layer_norm(
-                x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2)
+                x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2).contiguous()
+        elif x.is_contiguous(memory_format=torch.channels_last):
+            x = x.permute(0, 2, 3, 1)
+            # trick nvfuser into picking up layer norm, even though it's a single op
+            # it's a slight pessimization (~0.2%) if nvfuser is not enabled
+            x = F.layer_norm(
+                x, self.normalized_shape, self.weight, self.bias, self.eps) * 1.
+            return x.permute(0, 3, 1, 2)
         else:
             s, u = torch.var_mean(x, dim=1, unbiased=False, keepdim=True)
             x = (x - u) * torch.rsqrt(s + self.eps)
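
For reference, below is a minimal self-contained sketch of what `LayerNorm2d.forward` looks like after this change. Two pieces fall outside the hunk and are reconstructed as assumptions: the `_is_contiguous` helper (assumed to be a plain NCHW-contiguity check that works around a TorchScript limitation, matching timm's) and the affine scale/shift at the tail of the `else` branch.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


def _is_contiguous(tensor: torch.Tensor) -> bool:
    # Assumed helper: explicit memory_format arg isn't scriptable, so fall
    # back to the plain check under TorchScript.
    if torch.jit.is_scripting():
        return tensor.is_contiguous()
    return tensor.is_contiguous(memory_format=torch.contiguous_format)


class LayerNorm2d(nn.LayerNorm):
    """LayerNorm over the channel dim of an NCHW tensor."""

    def __init__(self, num_channels: int, eps: float = 1e-6):
        super().__init__(num_channels, eps=eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if _is_contiguous(x):
            # NCHW input: permute to NHWC, normalize over the last dim, permute
            # back; the trailing .contiguous() keeps downstream ops in NCHW.
            return F.layer_norm(
                x.permute(0, 2, 3, 1), self.normalized_shape, self.weight,
                self.bias, self.eps).permute(0, 3, 1, 2).contiguous()
        elif x.is_contiguous(memory_format=torch.channels_last):
            x = x.permute(0, 2, 3, 1)
            # `* 1.` is the nvfuser bait from the diff: it turns the lone
            # layer_norm into a small fusible group.
            x = F.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps) * 1.
            return x.permute(0, 3, 1, 2)
        else:
            # Manual fallback for non-contiguous inputs; the affine tail here
            # is reconstructed to match the var/mean normalization above.
            s, u = torch.var_mean(x, dim=1, unbiased=False, keepdim=True)
            x = (x - u) * torch.rsqrt(s + self.eps)
            x = x * self.weight[:, None, None] + self.bias[:, None, None]
            return x


if __name__ == "__main__":
    ln = LayerNorm2d(64)
    x = torch.randn(2, 64, 8, 8)
    y_nchw = ln(x)                                          # first branch
    y_cl = ln(x.to(memory_format=torch.channels_last))      # channels-last branch
    print(torch.allclose(y_nchw, y_cl, atol=1e-5))          # paths agree numerically
```

The check at the bottom is the property the diff relies on: all three branches compute the same normalization, so the dispatch is purely a layout/performance decision.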