diff --git a/timm/data/textdataset.py b/timm/data/textdataset.py
index 895b7c2d..a1861d2d 100644
--- a/timm/data/textdataset.py
+++ b/timm/data/textdataset.py
@@ -24,9 +24,9 @@ class TextDataset(Dataset):
         # Go to file idx//32
         # Get label(1x1) based on file name
         # Get vector(1x4096) at idx%32 in the file
-        #return a tensor x*y (x*y = 4096) and target tensor (1,) //Use x,y =64
-
+        #return a tensor x*y (x*y = 4096) and target tensor (1,) //Use x,y = 16,256
+        #print("idx=", idx)

         def listdir_nohidden(AllVideos_Path):  # To ignore hidden files
             file_dir_extension = os.path.join(AllVideos_Path, '*.txt')
             for f in glob.glob(file_dir_extension):
@@ -44,17 +44,19 @@ class TextDataset(Dataset):
             words = f.read().split()
             features = np.float32(words[feat * 4096:feat * 4096 + 4096])
             features = torch.tensor(features)
-            features = torch.reshape(features, (16, 256))
-            print(VideoPath)
+            # features = torch.reshape(features, (16, 256))
+            # features = torch.reshape(features, (196, 768))
+            features = torch.reshape(features, (1, 4096))
+            #print(VideoPath)

         if VideoPath.find('Normal') == -1:
             label = 0
         else:
             label = 1
         label = torch.tensor(label)
-        #print(features.shape)
+        print(features.shape)
         #print(features)
-        #print(label.shape)
-        print(label)
+        print(label.shape)
+        #print(label)

         return features, label
diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index 880fcc63..0e3d304f 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -190,6 +190,15 @@ def load_pretrained(model, default_cfg=None, num_classes=1000, in_chans=3, filte
     elif hf_hub_id and has_hf_hub(necessary=True):
         _logger.info(f'Loading pretrained weights from Hugging Face hub ({hf_hub_id})')
         state_dict = load_state_dict_from_hf(hf_hub_id)
+    print("pretrain state_dict:")
+    print(type(state_dict))
+    print(len(state_dict))
+    for key in list(state_dict.keys()):  # drop conv-stem weights; the patched model feeds tokens in directly
+        if key.startswith('stem'):
+            del state_dict[key]
+    for param_tensor in state_dict:
+        print(param_tensor, "\t", state_dict[param_tensor].size())
+
     if filter_fn is not None:
         # for backwards compat with filter fn that take one arg, try one first, the two
         try:
@@ -232,7 +241,9 @@ def load_pretrained(model, default_cfg=None, num_classes=1000, in_chans=3, filte
             classifier_bias = state_dict[classifier_name + '.bias']
             state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:]
 
-    model.load_state_dict(state_dict, strict=strict)
+    #print(state_dict.shape)
+    #model.load_state_dict(state_dict, strict=strict)
+    model.load_state_dict(state_dict, strict=False)  # strict=False: the stem keys were removed above
 
 
 def extract_layer(model, layer):
@@ -462,6 +473,7 @@ def build_model_with_cfg(
     if pretrained_custom_load:
         load_custom_pretrained(model)
     else:
+        print("num_classes_pretrained=", num_classes_pretrained)
         load_pretrained(
             model,
             num_classes=num_classes_pretrained,
diff --git a/timm/models/mlp_mixer.py b/timm/models/mlp_mixer.py
index c008bfb5..0444c59a 100644
--- a/timm/models/mlp_mixer.py
+++ b/timm/models/mlp_mixer.py
@@ -85,7 +85,7 @@ default_cfgs = dict(
     # Mixer ImageNet-21K-P pretraining
     mixer_b16_224_miil_in21k=_cfg(
         url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/mixer_b16_224_miil_in21k.pth',
-        mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=1, #11221
+        mean=(0, 0, 0), std=(1, 1, 1), crop_pct=0.875, interpolation='bilinear', num_classes=11221,
     ),
     mixer_b16_224_miil=_cfg(
         url='https://miil-public-eu.oss-eu-central-1.aliyuncs.com/model-zoo/ImageNet_21K_P/models/timm/mixer_b16_224_miil.pth',
@@ -264,7 +264,7 @@ class MlpMixer(nn.Module):
         super().__init__()
         self.num_classes = num_classes
         self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
-
+        self.initial_fc = nn.Linear(4096, 150528)  # 4096-d feature vector -> 196*768 token grid (150528 = 196 * 768)
         self.stem = PatchEmbed(
             img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
             norm_layer=norm_layer if stem_norm else None)
@@ -274,7 +274,7 @@ class MlpMixer(nn.Module):
         self.blocks = nn.Sequential(*[
             block_layer(
                 embed_dim
-                , 16 #self.stem.num_patches
+                , 196 #self.stem.num_patches; must match the 196-token grid built in forward()
                 , mlp_ratio, mlp_layer=mlp_layer, norm_layer=norm_layer, act_layer=act_layer,
                 drop=drop_rate, drop_path=drop_path_rate)
             for _ in range(num_blocks)])
@@ -286,24 +286,24 @@ class MlpMixer(nn.Module):
             for _ in range(num_blocks)])
         """
         self.norm = norm_layer(embed_dim)
-        self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
-        #self.head = nn.Sequential(
-        #    nn.Linear(embed_dim, self.num_classes),
-        #    nn.ReLU(),
-        #    nn.Dropout(p=0.3),
-        #    nn.Linear(self.num_classes, 1024),
-        #    nn.ReLU(),
-        #    nn.Dropout(p=0.3),
-        #    nn.Linear(1024, 512),
-        #    nn.ReLU(),
-        #    nn.Dropout(p=0.3),
-        #    nn.Linear(512, 256),
-        #    nn.ReLU(),
-        #    nn.Dropout(p=0.3),
-        #    nn.Linear(256, 1)
-        #    )
+        # self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+        self.head = nn.Sequential(
+            nn.Linear(embed_dim, self.num_classes),
+            nn.ReLU(),
+            nn.Dropout(p=0.3),
+            nn.Linear(self.num_classes, 1024),
+            nn.ReLU(),
+            nn.Dropout(p=0.3),
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(p=0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(p=0.3),
+            nn.Linear(256, 2)
+        )
         self.sigmoid = nn.Sigmoid()
-        #self.init_weights(nlhb=nlhb)
+        self.init_weights(nlhb=nlhb)
 
     def init_weights(self, nlhb=False):
         head_bias = -math.log(self.num_classes) if nlhb else 0.
@@ -318,7 +318,6 @@ class MlpMixer(nn.Module):
 
     def forward_features(self, x):
         #x = self.stem(x)
-        #print(x.shape)
         print("In_Model")
         x = self.blocks(x)
         print(x)
@@ -329,6 +328,8 @@ class MlpMixer(nn.Module):
         return x
 
     def forward(self, x):
+        x = self.initial_fc(x)
+        x = torch.reshape(x, (-1, 196, 768))  # keep the batch dimension: (B, 196, 768)
         x = self.forward_features(x)
         x = self.head(x)
         print(x)
@@ -384,7 +385,11 @@ def checkpoint_filter_fn(state_dict, model):
             if k.endswith('.alpha') or k.endswith('.beta'):
                 v = v.reshape(1, 1, -1)
             out_dict[k] = v
+        #print("checkpoint_filter_out_dict")
+        #print(out_dict)
         return out_dict
+    #print("checkpoint_filter_state_dict")
+    #print(state_dict)
     return state_dict
 
 
@@ -392,6 +397,9 @@ def _create_mixer(variant, pretrained=False, **kwargs):
     if kwargs.get('features_only', None):
         raise RuntimeError('features_only not implemented for MLP-Mixer models.')
 
+    print("_create_mixer")
+    print("Pretrained=", pretrained)
+    print("default_Cfgs=", default_cfgs[variant])
     model = build_model_with_cfg(
         MlpMixer, variant, pretrained,
         default_cfg=default_cfgs[variant],
@@ -495,7 +503,8 @@ def mixer_b16_224_miil_in21k(pretrained=False, **kwargs):
     """ Mixer-B/16 224x224. ImageNet-1k pretrained weights.
     Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
     """
-    model_args = dict(patch_size=16, num_blocks=12, embed_dim=256, **kwargs)
+    model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs)
+    #model_args = dict(patch_size=16, num_blocks=12, embed_dim=256, **kwargs)
     model = _create_mixer('mixer_b16_224_miil_in21k', pretrained=pretrained, **model_args)
     return model
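
Reviewer note: the three changed files work together. TextDataset now emits a (1, 4096) feature vector per sample instead of a (16, 256) grid; MlpMixer bypasses its conv stem, projecting that vector through initial_fc into the 196x768 token grid that Mixer-B/16 expects; and load_pretrained drops the stem.* keys and loads the in21k checkpoint with strict=False, so the pretrained blocks line up while initial_fc and the new MLP head train from scratch. Below is a minimal, self-contained sketch of that forward path, not the actual patched MlpMixer: FeatureMixerSketch is a hypothetical name, a single token/channel-mixing pair stands in for the 12 pretrained blocks, and a slimmed head stands in for the deep head added in the diff.

import torch
import torch.nn as nn

class FeatureMixerSketch(nn.Module):
    """Illustrative stand-in for the patched MlpMixer forward path."""
    def __init__(self, feat_dim=4096, num_tokens=196, embed_dim=768, num_classes=2):
        super().__init__()
        self.num_tokens, self.embed_dim = num_tokens, embed_dim
        self.initial_fc = nn.Linear(feat_dim, num_tokens * embed_dim)  # replaces the conv stem
        self.token_mix = nn.Linear(num_tokens, num_tokens)    # mixes across the 196 tokens
        self.channel_mix = nn.Linear(embed_dim, embed_dim)    # mixes across the 768 channels
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Sequential(                            # slimmed stand-in for the deep head
            nn.Linear(embed_dim, 256), nn.ReLU(), nn.Dropout(p=0.3),
            nn.Linear(256, num_classes))

    def forward(self, x):                                     # x: (B, 1, 4096), as TextDataset returns it
        x = self.initial_fc(x)                                # (B, 1, 150528)
        x = x.reshape(-1, self.num_tokens, self.embed_dim)    # (B, 196, 768) token grid
        x = x + self.token_mix(x.transpose(1, 2)).transpose(1, 2)
        x = x + self.channel_mix(x)
        x = self.norm(x).mean(dim=1)                          # global average pool -> (B, 768)
        return self.head(x)                                   # (B, 2) logits

if __name__ == '__main__':
    feats = torch.randn(8, 1, 4096)                           # batch of 8 video-segment feature vectors
    print(FeatureMixerSketch()(feats).shape)                  # torch.Size([8, 2])

With a 2-logit head, CrossEntropyLoss on the raw logits is the usual pairing; the sigmoid registered in the diff would instead suit a single-logit head trained with BCEWithLogitsLoss.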