diff --git a/out_log b/out_log
new file mode 100644
index 00000000..40955796
--- /dev/null
+++ b/out_log
@@ -0,0 +1,2011 @@
+_create_mixer
+Pretrained= False
+default_Cfgs= {'url': '', 'num_classes': 2, 'input_size': (3, 224, 224), 'pool_size': None, 'crop_pct': 0.875, 'interpolation': 'bicubic', 'fixed_input_size': True, 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), 'first_conv': 'stem.proj', 'classifier': 'head'}
+dataset_len= 288
+True
+<class 'torch.utils.data.dataloader.DataLoader'>
+dataset_len= 32
+False
+<class 'torch.utils.data.dataloader.DataLoader'>
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6931, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6927, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926,
+        0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926,
+        0.6926, 0.6926, 0.6926, 0.6925, 0.6926, 0.6926, 0.6926, 0.6926, 0.6926,
+        0.6926, 0.6926, 0.6925, 0.6926, 0.6937], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6927, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6916, 0.6916, 0.6918, 0.6917, 0.6918, 0.6916, 0.6916, 0.6918, 0.6918,
+        0.6917, 0.6917, 0.6917, 0.6917, 0.6917, 0.6916, 0.6917, 0.6917, 0.6916,
+        0.6947, 0.6917, 0.6917, 0.6917, 0.6917, 0.6917, 0.6917, 0.6917, 0.6918,
+        0.6916, 0.6916, 0.6916, 0.6917, 0.6918], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931, 0.6931,
+        0.6931, 0.6931, 0.6931, 0.6931, 0.6931], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6919, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6903, 0.6907, 0.6903, 0.6904, 0.6907, 0.6906, 0.6907, 0.6903, 0.6904,
+        0.6905, 0.6903, 0.6959, 0.6904, 0.6906, 0.6905, 0.6906, 0.6905, 0.6905,
+        0.6906, 0.6904, 0.6906, 0.6905, 0.6906, 0.6960, 0.6903, 0.6907, 0.6905,
+        0.6906, 0.6905, 0.6907, 0.6907, 0.6907], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6911, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6894, 0.6894, 0.6889, 0.6894, 0.6889, 0.6975, 0.6893, 0.6889, 0.6891,
+        0.6892, 0.6896, 0.6973, 0.6976, 0.6890, 0.6893, 0.6889, 0.6888, 0.6889,
+        0.6894, 0.6888, 0.6974, 0.6974, 0.6890, 0.6896, 0.6972, 0.6887, 0.6892,
+        0.6889, 0.6890, 0.6888, 0.6893, 0.6892], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6909, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6876, 0.6878, 0.6880, 0.6876, 0.6877, 0.6878, 0.6880, 0.6989, 0.6879,
+        0.6873, 0.6884, 0.6877, 0.6876, 0.6876, 0.6879, 0.6877, 0.6876, 0.6882,
+        0.6875, 0.6875, 0.6877, 0.6881, 0.6877, 0.6991, 0.6880, 0.6875, 0.6881,
+        0.6987, 0.6874, 0.6877, 0.6880, 0.6884], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6893, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6862, 0.6858, 0.6861, 0.7007, 0.6857, 0.6864, 0.6866, 0.6860, 0.6861,
+        0.6862, 0.6862, 0.6863, 0.6864, 0.6863, 0.6857, 0.6863, 0.6859, 0.6862,
+        0.6857, 0.7008, 0.6860, 0.6869, 0.6860, 0.7005, 0.6861, 0.6857, 0.6866,
+        0.7008, 0.6857, 0.6856, 0.6855, 0.6864], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6884, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6848, 0.7023, 0.7028, 0.6848, 0.6849, 0.6848, 0.7029, 0.6839, 0.6837,
+        0.6838, 0.6837, 0.6848, 0.6839, 0.6842, 0.6838, 0.6844, 0.6843, 0.6847,
+        0.6836, 0.6840, 0.7021, 0.6838, 0.6836, 0.6836, 0.7027, 0.6849, 0.7023,
+        0.7023, 0.6842, 0.6848, 0.6843, 0.6844], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6887, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6833, 0.6820, 0.6809, 0.7045, 0.6832, 0.6825, 0.6827, 0.6829, 0.7038,
+        0.7045, 0.6832, 0.6833, 0.6839, 0.6829, 0.6834, 0.6840, 0.6829, 0.7037,
+        0.6833, 0.6829, 0.6827, 0.6828, 0.6829, 0.6821, 0.6829, 0.6834, 0.6828,
+        0.6823, 0.6835, 0.6822, 0.6825, 0.6829], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6863, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[0.5123, 0.4877],
+        [0.5117, 0.4883],
+        [0.5118, 0.4882],
+        [0.5118, 0.4882],
+        [0.5124, 0.4876],
+        [0.5122, 0.4878],
+        [0.5119, 0.4881],
+        [0.5127, 0.4873],
+        [0.5123, 0.4877],
+        [0.5121, 0.4879],
+        [0.5120, 0.4880],
+        [0.5121, 0.4879],
+        [0.5122, 0.4878],
+        [0.5125, 0.4875],
+        [0.5128, 0.4872],
+        [0.5126, 0.4874],
+        [0.5124, 0.4876],
+        [0.5124, 0.4876],
+        [0.5124, 0.4876],
+        [0.5124, 0.4876],
+        [0.5123, 0.4877],
+        [0.5122, 0.4878],
+        [0.5120, 0.4880],
+        [0.5124, 0.4876],
+        [0.5120, 0.4880],
+        [0.5119, 0.4881],
+        [0.5117, 0.4883],
+        [0.5119, 0.4881],
+        [0.5118, 0.4882],
+        [0.5119, 0.4881],
+        [0.5117, 0.4883],
+        [0.5124, 0.4876]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(0.7054, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.6801, 0.6811, 0.6807, 0.6810, 0.6798, 0.6802, 0.6811, 0.6811, 0.6799,
+        0.6807, 0.7065, 0.6818, 0.6802, 0.6824, 0.6808, 0.6806, 0.6810, 0.6797,
+        0.6803, 0.6806, 0.7064, 0.6804, 0.6800, 0.7054, 0.6813, 0.6809, 0.6811,
+        0.6816, 0.6820, 0.6803, 0.6800, 0.6813], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932, 0.6932,
+        0.6932, 0.6932, 0.6932, 0.6932, 0.6932], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.6841, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.4440, 0.4431, 0.4423, 1.0512, 0.4354, 0.4477, 0.4348, 0.4372, 0.4326,
+        0.4353, 0.4288, 1.0622, 0.4329, 0.4263, 0.4323, 0.4431, 0.4356, 0.4367,
+        0.4443, 0.4378, 0.4332, 0.4242, 0.4547, 0.4266, 0.4205, 0.4291, 0.4446,
+        0.4455, 0.4256, 0.4609, 0.4424, 0.4271], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.7349, 0.7352, 0.7356, 0.7406, 0.7383, 0.7334, 0.7386, 0.7376, 0.7395,
+        0.7384, 0.7412, 0.7432, 0.7394, 0.7422, 0.7396, 0.7352, 0.7383, 0.7378,
+        0.7348, 0.7373, 0.7393, 0.7432, 0.7308, 0.7421, 0.7449, 0.7410, 0.7346,
+        0.7343, 0.7426, 0.7285, 0.7355, 0.7419], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5018, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.2373, 0.3479, 1.2251, 0.3466, 0.3493, 0.3496, 0.3549, 0.3486, 0.3413,
+        0.3564, 0.3540, 0.3417, 0.3526, 0.3504, 0.3545, 0.3465, 0.3580, 0.3578,
+        0.3433, 0.3538, 0.3425, 0.3502, 1.2251, 1.2219, 0.3512, 0.3503, 0.3460,
+        0.3590, 0.3446, 0.3562, 0.3484, 1.2262], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.7900, 0.7863, 0.7864, 0.7872, 0.7854, 0.7852, 0.7815, 0.7858, 0.7910,
+        0.7805, 0.7821, 0.7907, 0.7831, 0.7846, 0.7818, 0.7873, 0.7795, 0.7796,
+        0.7896, 0.7823, 0.7901, 0.7847, 0.7864, 0.7855, 0.7841, 0.7847, 0.7877,
+        0.7788, 0.7887, 0.7806, 0.7860, 0.7868], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5170, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3278, 0.3256, 0.3231, 0.3233, 0.3251, 0.3253, 0.3262, 0.3278, 0.3243,
+        0.3225, 0.3259, 0.3254, 0.3295, 0.3260, 0.3256, 0.3262, 0.3250, 0.3260,
+        0.3236, 0.3283, 0.3268, 0.3250, 0.3323, 0.3233, 0.3267, 0.3259, 0.3223,
+        0.3228, 0.3229, 0.3225, 0.3246, 0.3310], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8013, 0.8030, 0.8051, 0.8049, 0.8034, 0.8033, 0.8026, 0.8013, 0.8041,
+        0.8055, 0.8028, 0.8032, 0.8000, 0.8027, 0.8031, 0.8026, 0.8035, 0.8027,
+        0.8046, 0.8009, 0.8021, 0.8035, 0.7978, 0.8049, 0.8022, 0.8028, 0.8057,
+        0.8053, 0.8052, 0.8056, 0.8038, 0.7988], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3733, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3171, 0.3167, 0.3177, 0.3159, 0.3175, 1.3035, 0.3192, 0.3186, 0.3171,
+        0.3171, 0.3168, 0.3180, 0.3172, 1.3040, 0.3188, 0.3205, 0.3203, 0.3178,
+        1.3052, 0.3161, 0.3166, 0.3167, 1.3039, 1.3045, 0.3166, 0.3163, 1.3048,
+        1.3016, 1.3018, 0.3161, 0.3171, 0.3150], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8100, 0.8104, 0.8095, 0.8110, 0.8096, 0.8102, 0.8083, 0.8087, 0.8100,
+        0.8100, 0.8103, 0.8092, 0.8099, 0.8103, 0.8086, 0.8071, 0.8073, 0.8094,
+        0.8107, 0.8108, 0.8104, 0.8104, 0.8103, 0.8105, 0.8104, 0.8107, 0.8106,
+        0.8096, 0.8097, 0.8108, 0.8100, 0.8117], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5885, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3142, 0.3149, 0.3155, 0.3143, 1.3105, 0.3147, 1.3086, 0.3151, 1.3093,
+        1.3094, 0.3152, 0.3146, 0.3145, 0.3144, 0.3151, 0.3150, 1.3094, 0.3147,
+        0.3158, 0.3148, 0.3149, 0.3150, 0.3157, 0.3144, 0.3144, 0.3155, 0.3151,
+        0.3148, 0.3144, 0.3150, 0.3159, 0.3147], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8125, 0.8118, 0.8114, 0.8123, 0.8124, 0.8120, 0.8118, 0.8117, 0.8120,
+        0.8120, 0.8116, 0.8122, 0.8122, 0.8122, 0.8117, 0.8118, 0.8121, 0.8120,
+        0.8111, 0.8120, 0.8119, 0.8118, 0.8112, 0.8123, 0.8123, 0.8113, 0.8117,
+        0.8119, 0.8123, 0.8118, 0.8110, 0.8120], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5045, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3139, 0.3137, 0.3137, 0.3138, 0.3138, 1.3116, 0.3138, 0.3139, 0.3140,
+        0.3139, 1.3113, 0.3137, 0.3137, 0.3140, 0.3139, 0.3138, 0.3143, 0.3143,
+        0.3139, 0.3140, 0.3138, 0.3138, 0.3138, 0.3140, 0.3139, 1.3122, 0.3137,
+        0.3138, 0.3141, 0.3138, 1.3120, 0.3139], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8127, 0.8129, 0.8129, 0.8128, 0.8128, 0.8127, 0.8128, 0.8127, 0.8126,
+        0.8127, 0.8126, 0.8129, 0.8129, 0.8126, 0.8127, 0.8128, 0.8124, 0.8124,
+        0.8127, 0.8127, 0.8128, 0.8128, 0.8128, 0.8126, 0.8127, 0.8129, 0.8129,
+        0.8128, 0.8126, 0.8128, 0.8129, 0.8127], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4760, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3135, 0.3134, 0.3135, 0.3136, 0.3136, 0.3135, 0.3139, 0.3135, 0.3134,
+        0.3134, 0.3134, 0.3135, 0.3136, 0.3135, 0.3134, 0.3135, 1.3126, 0.3137,
+        0.3135, 0.3138, 0.3134, 0.3135, 0.3138, 0.3134, 0.3135, 0.3136, 0.3134,
+        0.3135, 1.3127, 0.3135, 0.3134, 0.3135], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8130, 0.8131, 0.8131, 0.8130, 0.8130, 0.8131, 0.8127, 0.8131, 0.8131,
+        0.8131, 0.8131, 0.8131, 0.8130, 0.8131, 0.8131, 0.8131, 0.8131, 0.8129,
+        0.8131, 0.8128, 0.8131, 0.8130, 0.8128, 0.8131, 0.8131, 0.8130, 0.8131,
+        0.8131, 0.8131, 0.8130, 0.8131, 0.8131], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4197, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3135, 0.3134, 0.3134, 0.3133, 0.3134, 1.3129, 1.3130, 0.3133, 0.3135,
+        0.3133, 0.3134, 0.3134, 0.3134, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3130, 0.3134, 0.3133, 0.3134, 0.3134, 0.3134, 0.3134, 0.3133, 0.3133,
+        0.3134, 0.3134, 0.3134, 0.3134, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8131, 0.8132, 0.8132, 0.8132, 0.8131, 0.8131, 0.8132, 0.8132, 0.8130,
+        0.8132, 0.8131, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132,
+        0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8131, 0.8131, 0.8132, 0.8132,
+        0.8131, 0.8131, 0.8131, 0.8132, 0.8132], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4477, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[9.9987e-01, 1.3297e-04],
+        [9.9981e-01, 1.9078e-04],
+        [9.9982e-01, 1.7943e-04],
+        [9.9982e-01, 1.7781e-04],
+        [9.9988e-01, 1.2462e-04],
+        [9.9986e-01, 1.3894e-04],
+        [9.9983e-01, 1.7157e-04],
+        [9.9989e-01, 1.0836e-04],
+        [9.9987e-01, 1.3451e-04],
+        [9.9985e-01, 1.4906e-04],
+        [9.9984e-01, 1.6087e-04],
+        [9.9984e-01, 1.5916e-04],
+        [9.9985e-01, 1.4523e-04],
+        [9.9988e-01, 1.1763e-04],
+        [9.9990e-01, 9.5136e-05],
+        [9.9989e-01, 1.0943e-04],
+        [9.9987e-01, 1.3151e-04],
+        [9.9987e-01, 1.2863e-04],
+        [9.9988e-01, 1.2315e-04],
+        [9.9988e-01, 1.2338e-04],
+        [9.9986e-01, 1.3644e-04],
+        [9.9986e-01, 1.4150e-04],
+        [9.9984e-01, 1.6069e-04],
+        [9.9988e-01, 1.2354e-04],
+        [9.9984e-01, 1.5766e-04],
+        [9.9983e-01, 1.7292e-04],
+        [9.9981e-01, 1.9048e-04],
+        [9.9982e-01, 1.7739e-04],
+        [9.9981e-01, 1.8760e-04],
+        [9.9983e-01, 1.7279e-04],
+        [9.9981e-01, 1.9314e-04],
+        [9.9987e-01, 1.2820e-04]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3130, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3131, 1.3131, 0.3133, 0.3133, 0.3133, 1.3130, 0.3133, 0.3133, 0.3134,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3134, 0.3133, 0.3133, 1.3131, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3134, 0.3133, 0.3133, 0.3133, 1.3131], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8131,
+        0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132,
+        0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132,
+        0.8132, 0.8132, 0.8132, 0.8132, 0.8132], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 1.3132, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3132, 0.3133, 0.3133,
+        0.3133, 1.3132, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8132, 0.8133, 0.8132,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8132, 0.8132, 0.8132, 0.8133, 0.8133,
+        0.8132, 0.8133, 0.8133, 0.8132, 0.8133, 0.8132, 0.8132, 0.8132, 0.8133,
+        0.8132, 0.8133, 0.8133, 0.8132, 0.8132], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3132,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8132, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5601, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3633, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4283e-08],
+        [1.0000e+00, 6.0943e-08],
+        [1.0000e+00, 5.2414e-08],
+        [1.0000e+00, 5.2234e-08],
+        [1.0000e+00, 2.9134e-08],
+        [1.0000e+00, 3.5439e-08],
+        [1.0000e+00, 5.2968e-08],
+        [1.0000e+00, 2.4628e-08],
+        [1.0000e+00, 3.3891e-08],
+        [1.0000e+00, 3.9833e-08],
+        [1.0000e+00, 4.6143e-08],
+        [1.0000e+00, 4.5376e-08],
+        [1.0000e+00, 3.9571e-08],
+        [1.0000e+00, 2.7552e-08],
+        [1.0000e+00, 1.9134e-08],
+        [1.0000e+00, 2.4356e-08],
+        [1.0000e+00, 3.3188e-08],
+        [1.0000e+00, 3.1166e-08],
+        [1.0000e+00, 2.8297e-08],
+        [1.0000e+00, 2.8663e-08],
+        [1.0000e+00, 3.4569e-08],
+        [1.0000e+00, 3.7047e-08],
+        [1.0000e+00, 4.4655e-08],
+        [1.0000e+00, 2.7564e-08],
+        [1.0000e+00, 4.3321e-08],
+        [1.0000e+00, 5.1786e-08],
+        [1.0000e+00, 6.0503e-08],
+        [1.0000e+00, 5.4386e-08],
+        [1.0000e+00, 5.9361e-08],
+        [1.0000e+00, 5.2789e-08],
+        [1.0000e+00, 6.3719e-08],
+        [1.0000e+00, 3.1184e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4280e-08],
+        [1.0000e+00, 6.0938e-08],
+        [1.0000e+00, 5.2409e-08],
+        [1.0000e+00, 5.2230e-08],
+        [1.0000e+00, 2.9132e-08],
+        [1.0000e+00, 3.5436e-08],
+        [1.0000e+00, 5.2964e-08],
+        [1.0000e+00, 2.4626e-08],
+        [1.0000e+00, 3.3888e-08],
+        [1.0000e+00, 3.9830e-08],
+        [1.0000e+00, 4.6140e-08],
+        [1.0000e+00, 4.5372e-08],
+        [1.0000e+00, 3.9568e-08],
+        [1.0000e+00, 2.7549e-08],
+        [1.0000e+00, 1.9132e-08],
+        [1.0000e+00, 2.4354e-08],
+        [1.0000e+00, 3.3185e-08],
+        [1.0000e+00, 3.1164e-08],
+        [1.0000e+00, 2.8295e-08],
+        [1.0000e+00, 2.8660e-08],
+        [1.0000e+00, 3.4566e-08],
+        [1.0000e+00, 3.7044e-08],
+        [1.0000e+00, 4.4652e-08],
+        [1.0000e+00, 2.7561e-08],
+        [1.0000e+00, 4.3318e-08],
+        [1.0000e+00, 5.1782e-08],
+        [1.0000e+00, 6.0497e-08],
+        [1.0000e+00, 5.4381e-08],
+        [1.0000e+00, 5.9356e-08],
+        [1.0000e+00, 5.2785e-08],
+        [1.0000e+00, 6.3714e-08],
+        [1.0000e+00, 3.1181e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4279e-08],
+        [1.0000e+00, 6.0936e-08],
+        [1.0000e+00, 5.2408e-08],
+        [1.0000e+00, 5.2228e-08],
+        [1.0000e+00, 2.9131e-08],
+        [1.0000e+00, 3.5435e-08],
+        [1.0000e+00, 5.2962e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3888e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6138e-08],
+        [1.0000e+00, 4.5371e-08],
+        [1.0000e+00, 3.9567e-08],
+        [1.0000e+00, 2.7549e-08],
+        [1.0000e+00, 1.9132e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3184e-08],
+        [1.0000e+00, 3.1163e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8660e-08],
+        [1.0000e+00, 3.4565e-08],
+        [1.0000e+00, 3.7043e-08],
+        [1.0000e+00, 4.4650e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1780e-08],
+        [1.0000e+00, 6.0495e-08],
+        [1.0000e+00, 5.4379e-08],
+        [1.0000e+00, 5.9354e-08],
+        [1.0000e+00, 5.2783e-08],
+        [1.0000e+00, 6.3712e-08],
+        [1.0000e+00, 3.1180e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4565e-08],
+        [1.0000e+00, 3.7043e-08],
+        [1.0000e+00, 4.4650e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0495e-08],
+        [1.0000e+00, 5.4379e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4564e-08],
+        [1.0000e+00, 3.7043e-08],
+        [1.0000e+00, 4.4649e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0495e-08],
+        [1.0000e+00, 5.4378e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        1.3133, 1.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 1.3133, 1.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4564e-08],
+        [1.0000e+00, 3.7042e-08],
+        [1.0000e+00, 4.4649e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0494e-08],
+        [1.0000e+00, 5.4378e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4564e-08],
+        [1.0000e+00, 3.7042e-08],
+        [1.0000e+00, 4.4649e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0494e-08],
+        [1.0000e+00, 5.4378e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 1.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4758, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4564e-08],
+        [1.0000e+00, 3.7042e-08],
+        [1.0000e+00, 4.4649e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0494e-08],
+        [1.0000e+00, 5.4378e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
+------Training-------
+batch= 0
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 1.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 1
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 2
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.3914, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 3
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4195, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 4
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        1.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 5
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 6
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 1.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 1.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5320, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 7
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 1.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.5039, device='cuda:0', grad_fn=<MeanBackward0>)
+batch= 8
+torch.Size([32, 16, 256])
+torch.Size([32, 2])
+torch.Size([32])
+LabelSmoothingCrossEntropy()
+confidence, nll_loss, self.smoothing, smooth_loss
+0.9 tensor([0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 0.3133, 1.3133, 0.3133, 0.3133, 0.3133, 0.3133, 0.3133,
+        0.3133, 0.3133, 1.3133, 0.3133, 0.3133], device='cuda:0',
+       grad_fn=<SqueezeBackward1>) 0.1 tensor([0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133, 0.8133,
+        0.8133, 0.8133, 0.8133, 0.8133, 0.8133], device='cuda:0',
+       grad_fn=<NegBackward0>)
+loss= tensor(0.4476, device='cuda:0', grad_fn=<MeanBackward0>)
+------Validating-------
+output = tensor([[1.0000e+00, 3.4278e-08],
+        [1.0000e+00, 6.0935e-08],
+        [1.0000e+00, 5.2407e-08],
+        [1.0000e+00, 5.2227e-08],
+        [1.0000e+00, 2.9130e-08],
+        [1.0000e+00, 3.5434e-08],
+        [1.0000e+00, 5.2961e-08],
+        [1.0000e+00, 2.4625e-08],
+        [1.0000e+00, 3.3887e-08],
+        [1.0000e+00, 3.9828e-08],
+        [1.0000e+00, 4.6137e-08],
+        [1.0000e+00, 4.5370e-08],
+        [1.0000e+00, 3.9566e-08],
+        [1.0000e+00, 2.7548e-08],
+        [1.0000e+00, 1.9131e-08],
+        [1.0000e+00, 2.4353e-08],
+        [1.0000e+00, 3.3183e-08],
+        [1.0000e+00, 3.1162e-08],
+        [1.0000e+00, 2.8294e-08],
+        [1.0000e+00, 2.8659e-08],
+        [1.0000e+00, 3.4564e-08],
+        [1.0000e+00, 3.7042e-08],
+        [1.0000e+00, 4.4649e-08],
+        [1.0000e+00, 2.7560e-08],
+        [1.0000e+00, 4.3316e-08],
+        [1.0000e+00, 5.1779e-08],
+        [1.0000e+00, 6.0494e-08],
+        [1.0000e+00, 5.4378e-08],
+        [1.0000e+00, 5.9353e-08],
+        [1.0000e+00, 5.2782e-08],
+        [1.0000e+00, 6.3711e-08],
+        [1.0000e+00, 3.1179e-08]], device='cuda:0')
+target= tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
+eval_loss= tensor(1.3133, device='cuda:0')
+eval_acc1= tensor(0., device='cuda:0')
diff --git a/timm/data/textdataset.py b/timm/data/textdataset.py
index a1861d2d..30f11309 100644
--- a/timm/data/textdataset.py
+++ b/timm/data/textdataset.py
@@ -39,14 +39,19 @@ class TextDataset(Dataset):
         All_Videos.sort()
         #print(All_Videos)
         VideoPath = os.path.join(self.path, All_Videos[idx//32])
+        #print(VideoPath)
         f = open(VideoPath, "r")
         feat = idx%32
         words = f.read().split()
         features = np.float32(words[feat * 4096:feat * 4096 + 4096])
         features = torch.tensor(features)
-        # features = torch.reshape(features, (16, 256))
+        #print(features.shape)
+        if len(features) == 0:
+            print(idx)
+            print(VideoPath)
+        features = torch.reshape(features, (16, 256))
         # features = torch.reshape(features, (196, 768))
-        features = torch.reshape(features, (1, 4096))
+        #features = torch.reshape(features, (1, 4096))
         #print(VideoPath)
         if VideoPath.find('Normal') == -1:
             label = 0
@@ -54,9 +59,9 @@ class TextDataset(Dataset):
             label = 1
 
         label = torch.tensor(label)
-        print(features.shape)
+        #print(features.shape)
         #print(features)
-        print(label.shape)
+        #print(label.shape)
         #print(label)
 
         return features, label
diff --git a/timm/models/mlp_mixer.py b/timm/models/mlp_mixer.py
index 0444c59a..5632a537 100644
--- a/timm/models/mlp_mixer.py
+++ b/timm/models/mlp_mixer.py
@@ -63,7 +63,7 @@ def _cfg(url='', **kwargs):
 
 
 default_cfgs = dict(
-    mixer_s32_224=_cfg(),
+    mixer_s32_224=_cfg(num_classes=2),
     mixer_s16_224=_cfg(),
     mixer_b32_224=_cfg(),
     mixer_b16_224=_cfg(
@@ -264,12 +264,13 @@ class MlpMixer(nn.Module):
         super().__init__()
         self.num_classes = num_classes
         self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
-        self.initial_fc =nn.Linear(4096, 150528)
-        self.stem = PatchEmbed(
-            img_size=img_size, patch_size=patch_size, in_chans=in_chans,
-            embed_dim=embed_dim, norm_layer=norm_layer if stem_norm else None)
+        
+        ##initial_fc and stem not needed
+        #self.initial_fc =nn.Linear(4096, 150528)
+        #self.stem = PatchEmbed(
+        #    img_size=img_size, patch_size=patch_size, in_chans=in_chans,
+        #    embed_dim=embed_dim, norm_layer=norm_layer if stem_norm else None)
         # FIXME drop_path (stochastic depth scaling rule or all the same?)
-        #embed_dim=256
         #print("num_classes:",self.num_classes, "embed_dim:", embed_dim)
         self.blocks = nn.Sequential(*[
             block_layer(
@@ -286,23 +287,24 @@ class MlpMixer(nn.Module):
             for _ in range(num_blocks)])
         """
         self.norm = norm_layer(embed_dim)
-        # self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
-        self.head = nn.Sequential(
-            nn.Linear(embed_dim, self.num_classes),
-            nn.ReLU(),
-            nn.Dropout(p=0.3),
-            nn.Linear(self.num_classes, 1024),
-            nn.ReLU(),
-            nn.Dropout(p=0.3),
-            nn.Linear(1024, 512),
-            nn.ReLU(),
-            nn.Dropout(p=0.3),
-            nn.Linear(512, 256),
-            nn.ReLU(),
-            nn.Dropout(p=0.3),
-            nn.Linear(256, 2)
-        )
-        self.sigmoid = nn.Sigmoid()
+        self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity()
+        # self.head = nn.Sequential(
+        #     nn.Linear(embed_dim, self.num_classes),
+        #     nn.ReLU(),
+        #     nn.Dropout(p=0.3),
+        #     nn.Linear(self.num_classes, 1024),
+        #     nn.ReLU(),
+        #     nn.Dropout(p=0.3),
+        #     nn.Linear(1024, 512),
+        #     nn.ReLU(),
+        #     nn.Dropout(p=0.3),
+        #     nn.Linear(512, 256),
+        #     nn.ReLU(),
+        #     nn.Dropout(p=0.3),
+        #     nn.Linear(256, 2)
+        # )
+        #self.sigmoid = nn.Sigmoid()
+        self.sm = nn.Softmax(dim=1)
         self.init_weights(nlhb=nlhb)
 
     def init_weights(self, nlhb=False):
@@ -318,23 +320,24 @@ class MlpMixer(nn.Module):
 
     def forward_features(self, x):
         #x = self.stem(x)
-        print("In_Model")
+        #print("In_Model")
         x = self.blocks(x)
-        print(x)
+        #print(x)
         x = self.norm(x)
-        print(x)
+        #print(x)
         x = x.mean(dim=1)
-        print(x)
+        #print(x)
         return x
 
     def forward(self, x):
-        x = self.initial_fc(x)
-        x = torch.reshape(x, (196, 768))
+        #x = self.initial_fc(x)
+        #x = torch.reshape(x, (196, 768))
         x = self.forward_features(x)
         x = self.head(x)
-        print(x)
-        x = self.sigmoid(x)
-        print(x)
+        #print(x)
+        #x = self.sigmoid(x)
+        #print(x)
+        x = self.sm(x)
         return x
 
 
@@ -413,7 +416,8 @@ def mixer_s32_224(pretrained=False, **kwargs):
     """ Mixer-S/32 224x224
     Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601
     """
-    model_args = dict(patch_size=32, num_blocks=8, embed_dim=512, **kwargs)
+    #model_args = dict(patch_size=32, num_blocks=8, embed_dim=512, **kwargs)
+    model_args = dict(patch_size=16, num_blocks=8, embed_dim=256, **kwargs)
     model = _create_mixer('mixer_s32_224', pretrained=pretrained, **model_args)
     return model
 
diff --git a/train.py b/train.py
index b9f38ced..c9116938 100644
--- a/train.py
+++ b/train.py
@@ -679,10 +679,11 @@ def train_one_epoch(
     data_time_m = AverageMeter()
     losses_m = AverageMeter()
 
+    print("------Training-------")
     model.train()
 
     end = time.time()
-    print("loader_length=",len(loader))
+    #print("loader_length=",len(loader))
     last_idx = len(loader) - 1
     num_updates = epoch * len(loader)
     for batch_idx, (input, target) in enumerate(loader):
@@ -698,13 +699,15 @@ def train_one_epoch(
 
         with amp_autocast():
             #print(model)
+            print(input.shape)
             output = model(input)
             print(output.shape)
             print(target.shape)
             #print(output)
             #print(target)
-            #print(loss_fn)
+            print(loss_fn)
             loss = loss_fn(output, target)
+            print("loss=", loss)
 
         if not args.distributed:
             losses_m.update(loss.item(), input.size(0))
@@ -785,6 +788,7 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='')
     top1_m = AverageMeter()
     top5_m = AverageMeter()
 
+    print("------Validating-------")
     model.eval()
 
     end = time.time()
@@ -809,8 +813,12 @@ def validate(model, loader, loss_fn, args, amp_autocast=suppress, log_suffix='')
                 output = output.unfold(0, reduce_factor, reduce_factor).mean(dim=2)
                 target = target[0:target.size(0):reduce_factor]
 
+            print("output =", output)
+            print("target=", target)
             loss = loss_fn(output, target)
+            print("eval_loss=", loss)
             acc1, acc5 = accuracy(output, target, topk=(1, 5))
+            print("eval_acc1=", acc1)
 
             if args.distributed:
                 reduced_loss = reduce_tensor(loss.data, args.world_size)