diff --git a/timm/models/vision_transformer.py b/timm/models/vision_transformer.py
index aed295ec..5fb5c7c7 100644
--- a/timm/models/vision_transformer.py
+++ b/timm/models/vision_transformer.py
@@ -373,7 +373,7 @@ class VisionTransformer(nn.Module):
     def __init__(self, img_size=224, patch_size=None, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
                  num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
                  drop_rate=0., attn_drop_rate=0., drop_path_rate=0., hybrid_backbone=None, norm_layer=None,
-                 act_layer=None):
+                 act_layer=None, weight_init=''):
         """
         Args:
             img_size (int, tuple): input image size
@@ -434,17 +434,13 @@ class VisionTransformer(nn.Module):
         self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
 
         trunc_normal_(self.pos_embed, std=.02)
-        trunc_normal_(self.cls_token, std=.02)
-        self.apply(self._init_weights)
-
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=.02)
-            if isinstance(m, nn.Linear) and m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
+        if weight_init != 'jax':  # leave as zeros to match JAX impl
+            trunc_normal_(self.cls_token, std=.02)
+        for n, m in self.named_modules():
+            if weight_init == 'jax':
+                _init_weights_jax(m, n)
+            else:
+                _init_weights_original(m, n)
 
     @torch.jit.ignore
     def no_weight_decay(self):
@@ -479,6 +475,58 @@ class VisionTransformer(nn.Module):
         return x
 
 
+def _init_weights_original(m: nn.Module, n: str = ''):
+    if isinstance(m, nn.Linear):
+        trunc_normal_(m.weight, std=.02)
+        if isinstance(m, nn.Linear) and m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+    elif isinstance(m, nn.LayerNorm):
+        nn.init.zeros_(m.bias)
+        nn.init.ones_(m.weight)
+
+
+def _init_weights_jax(m: nn.Module, n: str):
+    """ Weight init scheme closer to the official JAX impl than my original init"""
+
+    def _fan_in(tensor):
+        dimensions = tensor.dim()
+        if dimensions < 2:
+            raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions")
+
+        num_input_fmaps = tensor.size(1)
+        receptive_field_size = 1
+        if tensor.dim() > 2:
+            receptive_field_size = tensor[0][0].numel()
+        fan_in = num_input_fmaps * receptive_field_size
+        return fan_in
+
+    def _lecun_normal(w):
+        stddev = (1.0 / _fan_in(w)) ** 0.5 / .87962566103423978
+        trunc_normal_(w, 0, stddev)
+
+    if isinstance(m, nn.Linear):
+        if 'head' in n:
+            nn.init.zeros_(m.weight)
+            nn.init.zeros_(m.bias)
+        elif 'pre_logits' in n:
+            _lecun_normal(m.weight)
+            nn.init.zeros_(m.bias)
+        else:
+            nn.init.xavier_uniform_(m.weight)
+            if m.bias is not None:
+                if 'mlp' in n:
+                    nn.init.normal_(m.bias, 0, 1e-6)
+                else:
+                    nn.init.zeros_(m.bias)
+    elif isinstance(m, nn.Conv2d):
+        _lecun_normal(m.weight)
+        if m.bias is not None:
+            nn.init.zeros_(m.bias)
+    elif isinstance(m, nn.LayerNorm):
+        nn.init.constant_(m.bias, 0.)
+        nn.init.constant_(m.weight, 1.)
+
+
 class DistilledVisionTransformer(VisionTransformer):
     """ Vision Transformer with distillation token.
 
@@ -496,7 +544,7 @@ class DistilledVisionTransformer(VisionTransformer):
 
         trunc_normal_(self.dist_token, std=.02)
         trunc_normal_(self.pos_embed, std=.02)
-        self.head_dist.apply(self._init_weights)
+        self.head_dist.apply(_init_weights_original)
 
     def forward_features(self, x):
         B = x.shape[0]