diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py
index 6b2dabba..8e555b8b 100644
--- a/timm/layers/__init__.py
+++ b/timm/layers/__init__.py
@@ -3,7 +3,7 @@ from .adaptive_avgmax_pool import \
     adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
 from .attention_pool2d import AttentionPool2d, RotAttentionPool2d, RotaryEmbedding
 from .blur_pool import BlurPool2d
-from .classifier import ClassifierHead, create_classifier
+from .classifier import ClassifierHead, create_classifier, NormMlpClassifierHead
 from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\
     set_layer_config
diff --git a/timm/layers/classifier.py b/timm/layers/classifier.py
index e885084c..d93d0ec7 100644
--- a/timm/layers/classifier.py
+++ b/timm/layers/classifier.py
@@ -2,10 +2,17 @@
 
 Hacked together by / Copyright 2020 Ross Wightman
 """
-from torch import nn as nn
+from collections import OrderedDict
+from functools import partial
+from typing import Optional, Union, Callable
+
+import torch
+import torch.nn as nn
 from torch.nn import functional as F
 
 from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .create_act import get_act_layer
+from .create_norm import get_norm_layer
 
 
 def _create_pool(num_features, num_classes, pool_type='avg', use_conv=False):
@@ -38,7 +45,21 @@ def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False
 class ClassifierHead(nn.Module):
     """Classifier head w/ configurable global pooling and dropout."""
 
-    def __init__(self, in_features, num_classes, pool_type='avg', drop_rate=0., use_conv=False):
+    def __init__(
+            self,
+            in_features: int,
+            num_classes: int,
+            pool_type: str = 'avg',
+            drop_rate: float = 0.,
+            use_conv: bool = False,
+    ):
+        """
+        Args:
+            in_features: The number of input features.
+            num_classes: The number of classes for the final classifier layer (output).
+            pool_type: Global pooling type, pooling disabled if empty string ('').
+            drop_rate: Pre-classifier dropout rate.
+        """
         super(ClassifierHead, self).__init__()
         self.drop_rate = drop_rate
         self.in_features = in_features
@@ -65,3 +86,76 @@ class ClassifierHead(nn.Module):
         else:
             x = self.fc(x)
             return self.flatten(x)
+
+
+class NormMlpClassifierHead(nn.Module):
+
+    def __init__(
+            self,
+            in_features: int,
+            num_classes: int,
+            hidden_size: Optional[int] = None,
+            pool_type: str = 'avg',
+            drop_rate: float = 0.,
+            norm_layer: Union[str, Callable] = 'layernorm2d',
+            act_layer: Union[str, Callable] = 'tanh',
+    ):
+        """
+        Args:
+            in_features: The number of input features.
+            num_classes: The number of classes for the final classifier layer (output).
+            hidden_size: The hidden size of the MLP (pre-logits FC layer) if not None.
+            pool_type: Global pooling type, pooling disabled if empty string ('').
+            drop_rate: Pre-classifier dropout rate.
+            norm_layer: Normalization layer type.
+            act_layer: MLP activation layer type (only used if hidden_size is not None).
+        """
+        super().__init__()
+        self.drop_rate = drop_rate
+        self.in_features = in_features
+        self.hidden_size = hidden_size
+        self.num_features = in_features
+        self.use_conv = not pool_type
+        norm_layer = get_norm_layer(norm_layer)
+        act_layer = get_act_layer(act_layer)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear
+
+        self.global_pool = SelectAdaptivePool2d(pool_type=pool_type)
+        self.norm = norm_layer(in_features)
+        self.flatten = nn.Flatten(1) if pool_type else nn.Identity()
+        if hidden_size:
+            self.pre_logits = nn.Sequential(OrderedDict([
+                ('fc', linear_layer(in_features, hidden_size)),
+                ('act', act_layer()),
+            ]))
+            self.num_features = hidden_size
+        else:
+            self.pre_logits = nn.Identity()
+        self.drop = nn.Dropout(self.drop_rate)
+        self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def reset(self, num_classes, global_pool=None):
+        if global_pool is not None:
+            self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
+            self.flatten = nn.Flatten(1) if global_pool else nn.Identity()
+        self.use_conv = self.global_pool.is_identity()
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear
+        if self.hidden_size:
+            if ((isinstance(self.pre_logits.fc, nn.Conv2d) and not self.use_conv) or
+                    (isinstance(self.pre_logits.fc, nn.Linear) and self.use_conv)):
+                with torch.no_grad():
+                    new_fc = linear_layer(self.in_features, self.hidden_size)
+                    new_fc.weight.copy_(self.pre_logits.fc.weight.reshape(new_fc.weight.shape))
+                    new_fc.bias.copy_(self.pre_logits.fc.bias)
+                    self.pre_logits.fc = new_fc
+        self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+
+    def forward(self, x, pre_logits: bool = False):
+        x = self.global_pool(x)
+        x = self.norm(x)
+        x = self.flatten(x)
+        x = self.pre_logits(x)
+        if pre_logits:
+            return x
+        x = self.fc(x)
+        return x
diff --git a/timm/models/convnext.py b/timm/models/convnext.py
index 2bbe0b11..1655ad34 100644
--- a/timm/models/convnext.py
+++ b/timm/models/convnext.py
@@ -39,6 +39,7 @@ Modifications and additions for timm hacked together by / Copyright 2022, Ross W
 
 from collections import OrderedDict
 from functools import partial
+from typing import Callable, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -46,6 +47,7 @@ import torch.nn as nn
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 from timm.layers import trunc_normal_, SelectAdaptivePool2d, DropPath, Mlp, GlobalResponseNormMlp, \
     LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
+from timm.layers import NormMlpClassifierHead, ClassifierHead
 from ._builder import build_model_with_cfg
 from ._manipulate import named_apply, checkpoint_seq
 from ._pretrained import generate_default_cfgs
@@ -188,48 +190,50 @@ class ConvNeXt(nn.Module):
 
     def __init__(
             self,
-            in_chans=3,
-            num_classes=1000,
-            global_pool='avg',
-            output_stride=32,
-            depths=(3, 3, 9, 3),
-            dims=(96, 192, 384, 768),
-            kernel_sizes=7,
-            ls_init_value=1e-6,
-            stem_type='patch',
-            patch_size=4,
-            head_init_scale=1.,
-            head_norm_first=False,
-            conv_mlp=False,
-            conv_bias=True,
-            use_grn=False,
-            act_layer='gelu',
-            norm_layer=None,
-            norm_eps=None,
-            drop_rate=0.,
-            drop_path_rate=0.,
+            in_chans: int = 3,
+            num_classes: int = 1000,
+            global_pool: str = 'avg',
+            output_stride: int = 32,
+            depths: Tuple[int, ...] = (3, 3, 9, 3),
+            dims: Tuple[int, ...] = (96, 192, 384, 768),
+            kernel_sizes: Union[int, Tuple[int, ...]] = 7,
+            ls_init_value: Optional[float] = 1e-6,
+            stem_type: str = 'patch',
+            patch_size: int = 4,
+            head_init_scale: float = 1.,
+            head_norm_first: bool = False,
+            head_hidden_size: Optional[int] = None,
+            conv_mlp: bool = False,
+            conv_bias: bool = True,
+            use_grn: bool = False,
+            act_layer: Union[str, Callable] = 'gelu',
+            norm_layer: Optional[Union[str, Callable]] = None,
+            norm_eps: Optional[float] = None,
+            drop_rate: float = 0.,
+            drop_path_rate: float = 0.,
     ):
         """
         Args:
-            in_chans (int): Number of input image channels (default: 3)
-            num_classes (int): Number of classes for classification head (default: 1000)
-            global_pool (str): Global pooling type (default: 'avg')
-            output_stride (int): Output stride of network, one of (8, 16, 32) (default: 32)
-            depths (tuple(int)): Number of blocks at each stage. (default: [3, 3, 9, 3])
-            dims (tuple(int)): Feature dimension at each stage. (default: [96, 192, 384, 768])
-            kernel_sizes (Union[int, List[int]]: Depthwise convolution kernel-sizes for each stage (default: 7)
-            ls_init_value (float): Init value for Layer Scale (default: 1e-6)
-            stem_type (str): Type of stem (default: 'patch')
-            patch_size (int): Stem patch size for patch stem (default: 4)
-            head_init_scale (float): Init scaling value for classifier weights and biases (default: 1)
-            head_norm_first (bool): Apply normalization before global pool + head (default: False)
-            conv_mlp (bool): Use 1x1 conv in MLP, improves speed for small networks w/ chan last (default: False)
-            conv_bias (bool): Use bias layers w/ all convolutions (default: True)
-            use_grn (bool): Use Global Response Norm (ConvNeXt-V2) in MLP (default: False)
-            act_layer (Union[str, nn.Module]): Activation Layer
-            norm_layer (Union[str, nn.Module]): Normalization Layer
-            drop_rate (float): Head dropout rate (default: 0.)
-            drop_path_rate (float): Stochastic depth rate (default: 0.)
+            in_chans: Number of input image channels.
+            num_classes: Number of classes for classification head.
+            global_pool: Global pooling type.
+            output_stride: Output stride of network, one of (8, 16, 32).
+            depths: Number of blocks at each stage.
+            dims: Feature dimension at each stage.
+            kernel_sizes: Depthwise convolution kernel-sizes for each stage.
+            ls_init_value: Init value for Layer Scale, disabled if None.
+            stem_type: Type of stem.
+            patch_size: Stem patch size for patch stem.
+            head_init_scale: Init scaling value for classifier weights and biases.
+            head_norm_first: Apply normalization before global pool + head.
+            head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
+            conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
+            conv_bias: Use bias layers w/ all convolutions.
+            use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
+            act_layer: Activation layer type.
+            norm_layer: Normalization layer type.
+            drop_rate: Head pre-classifier dropout rate.
+            drop_path_rate: Stochastic depth drop rate.
         """
         super().__init__()
         assert output_stride in (8, 16, 32)
@@ -307,14 +311,26 @@ class ConvNeXt(nn.Module):
 
         # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
         # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
-        self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity()
-        self.head = nn.Sequential(OrderedDict([
-            ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
-            ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)),
-            ('flatten', nn.Flatten(1) if global_pool else nn.Identity()),
-            ('drop', nn.Dropout(self.drop_rate)),
-            ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())]))
-
+        if head_norm_first:
+            assert not head_hidden_size
+            self.norm_pre = norm_layer(self.num_features)
+            self.head = ClassifierHead(
+                self.num_features,
+                num_classes,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+            )
+        else:
+            self.norm_pre = nn.Identity()
+            self.head = NormMlpClassifierHead(
+                self.num_features,
+                num_classes,
+                hidden_size=head_hidden_size,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+                norm_layer=norm_layer,
+                act_layer='gelu',
+            )
         named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
 
     @torch.jit.ignore
@@ -338,10 +354,7 @@ class ConvNeXt(nn.Module):
         return self.head.fc
 
     def reset_classifier(self, num_classes=0, global_pool=None):
-        if global_pool is not None:
-            self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-            self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity()
-        self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        self.head.reset(num_classes, global_pool=global_pool)
 
     def forward_features(self, x):
         x = self.stem(x)
@@ -350,12 +363,7 @@ class ConvNeXt(nn.Module):
         return x
 
     def forward_head(self, x, pre_logits: bool = False):
-        # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :(
-        x = self.head.global_pool(x)
-        x = self.head.norm(x)
-        x = self.head.flatten(x)
-        x = self.head.drop(x)
-        return x if pre_logits else self.head.fc(x)
+        return self.head(x, pre_logits=pre_logits)
 
     def forward(self, x):
         x = self.forward_features(x)
diff --git a/timm/models/davit.py b/timm/models/davit.py
index 8b9e67b4..e9871265 100644
--- a/timm/models/davit.py
+++ b/timm/models/davit.py
@@ -23,6 +23,7 @@ from torch import Tensor
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from timm.layers import DropPath, to_2tuple, trunc_normal_, SelectAdaptivePool2d, Mlp, LayerNorm2d, get_norm_layer
+from timm.layers import NormMlpClassifierHead, ClassifierHead
 from ._builder import build_model_with_cfg
 from ._features_fx import register_notrace_function
 from ._manipulate import checkpoint_seq
@@ -519,14 +520,23 @@ class DaViT(nn.Module):
         # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
         # otherwise pool -> norm -> fc, the default DaViT order, similar to ConvNeXt
         # FIXME generalize this structure to ClassifierHead
-        self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity()
-        self.head = nn.Sequential(OrderedDict([
-            ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)),
-            ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)),
-            ('flatten', nn.Flatten(1) if global_pool else nn.Identity()),
-            ('drop', nn.Dropout(self.drop_rate)),
-            ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())]))
-
+        if head_norm_first:
+            self.norm_pre = norm_layer(self.num_features)
+            self.head = ClassifierHead(
+                self.num_features,
+                num_classes,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+            )
+        else:
+            self.norm_pre = nn.Identity()
+            self.head = NormMlpClassifierHead(
+                self.num_features,
+                num_classes,
+                pool_type=global_pool,
+                drop_rate=self.drop_rate,
+                norm_layer=norm_layer,
+            )
         self.apply(self._init_weights)
 
     def _init_weights(self, m):
@@ -546,10 +556,7 @@ class DaViT(nn.Module):
         return self.head.fc
 
     def reset_classifier(self, num_classes, global_pool=None):
-        if global_pool is not None:
-            self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-            self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity()
-        self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
+        self.head.reset(num_classes, global_pool=global_pool)
 
     def forward_features(self, x):
         x = self.stem(x)
diff --git a/timm/models/maxxvit.py b/timm/models/maxxvit.py
index e730fa30..f41dba8b 100644
--- a/timm/models/maxxvit.py
+++ b/timm/models/maxxvit.py
@@ -44,7 +44,7 @@ import torch
 from torch import nn
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from timm.layers import Mlp, ConvMlp, DropPath, ClassifierHead, LayerNorm, SelectAdaptivePool2d
+from timm.layers import Mlp, ConvMlp, DropPath, LayerNorm, ClassifierHead, NormMlpClassifierHead
 from timm.layers import create_attn, get_act_layer, get_norm_layer, get_norm_act_layer, create_conv2d, create_pool2d
 from timm.layers import trunc_normal_tf_, to_2tuple, extend_tuple, make_divisible, _assert
 from timm.layers import RelPosMlp, RelPosBias, RelPosBiasTf
@@ -1072,69 +1072,6 @@ def cfg_window_size(cfg: MaxxVitTransformerCfg, img_size: Tuple[int, int]):
     return cfg
 
 
-class NormMlpHead(nn.Module):
-
-    def __init__(
-            self,
-            in_features,
-            num_classes,
-            hidden_size=None,
-            pool_type='avg',
-            drop_rate=0.,
-            norm_layer='layernorm2d',
-            act_layer='tanh',
-    ):
-        super().__init__()
-        self.drop_rate = drop_rate
-        self.in_features = in_features
-        self.hidden_size = hidden_size
-        self.num_features = in_features
-        self.use_conv = not pool_type
-        norm_layer = get_norm_layer(norm_layer)
-        act_layer = get_act_layer(act_layer)
-        linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear
-
-        self.global_pool = SelectAdaptivePool2d(pool_type=pool_type)
-        self.norm = norm_layer(in_features)
-        self.flatten = nn.Flatten(1) if pool_type else nn.Identity()
-        if hidden_size:
-            self.pre_logits = nn.Sequential(OrderedDict([
-                ('fc', linear_layer(in_features, hidden_size)),
-                ('act', act_layer()),
-            ]))
-            self.num_features = hidden_size
-        else:
-            self.pre_logits = nn.Identity()
-        self.drop = nn.Dropout(self.drop_rate)
-        self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
-
-    def reset(self, num_classes, global_pool=None):
-        if global_pool is not None:
-            self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-            self.flatten = nn.Flatten(1) if global_pool else nn.Identity()
-        self.use_conv = self.global_pool.is_identity()
-        linear_layer = partial(nn.Conv2d, kernel_size=1) if self.use_conv else nn.Linear
-        if self.hidden_size:
-            if ((isinstance(self.pre_logits.fc, nn.Conv2d) and not self.use_conv) or
-                    (isinstance(self.pre_logits.fc, nn.Linear) and self.use_conv)):
-                with torch.no_grad():
-                    new_fc = linear_layer(self.in_features, self.hidden_size)
-                    new_fc.weight.copy_(self.pre_logits.fc.weight.reshape(new_fc.weight.shape))
-                    new_fc.bias.copy_(self.pre_logits.fc.bias)
-                    self.pre_logits.fc = new_fc
-        self.fc = linear_layer(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
-
-    def forward(self, x, pre_logits: bool = False):
-        x = self.global_pool(x)
-        x = self.norm(x)
-        x = self.flatten(x)
-        x = self.pre_logits(x)
-        if pre_logits:
-            return x
-        x = self.fc(x)
-        return x
-
-
 def _overlay_kwargs(cfg: MaxxVitCfg, **kwargs):
     transformer_kwargs = {}
     conv_kwargs = {}
@@ -1225,7 +1162,7 @@ class MaxxVit(nn.Module):
         self.head_hidden_size = cfg.head_hidden_size
         if self.head_hidden_size:
             self.norm = nn.Identity()
-            self.head = NormMlpHead(
+            self.head = NormMlpClassifierHead(
                 self.num_features,
                 num_classes,
                 hidden_size=self.head_hidden_size,
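Below is a minimal usage sketch of the extracted NormMlpClassifierHead, based only on the constructor, forward(), and reset() shown in this diff; the feature shape and hidden_size value are illustrative, not taken from any particular model config.

import torch
from timm.layers import NormMlpClassifierHead

# Head over a 768-channel NCHW feature map; hidden_size enables the pre-logits fc + act MLP.
head = NormMlpClassifierHead(
    in_features=768,
    num_classes=1000,
    hidden_size=512,           # illustrative; ConvNeXt passes head_hidden_size here
    pool_type='avg',
    norm_layer='layernorm2d',
    act_layer='tanh',
)

x = torch.randn(2, 768, 7, 7)         # backbone features (B, C, H, W)
logits = head(x)                      # (2, 1000): pool -> norm -> flatten -> pre_logits -> fc
feats = head(x, pre_logits=True)      # (2, 512): stops before the final fc

# Replace the classifier for a new task; reset() also swaps the pre-logits fc between
# nn.Linear and 1x1 nn.Conv2d (copying weights) if the pooling mode changes.
head.reset(num_classes=10, global_pool='avg')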