Merge pull request #155 from rwightman/densenet_update_and_more

DenseNet updates, EvoNorms, VovNet, activation factory and more. Includes PR #142
Ross Wightman
commit d1b5dddad1

@@ -2,6 +2,19 @@
## What's New
### June 11, 2020
Bunch of changes:
* DenseNet models updated with the memory-efficient addition from torchvision (fixed a bug), plus blur pooling and deep stem additions
* VoVNet V1 and V2 models added; the 39-layer V2 variant (ese_vovnet39b) trained to 79.3 top-1
* Activation factory added along with new activations (see the sketch after this list):
  * select activation at model creation time for more flexibility in using activations compatible with scripting or tracing (ONNX export)
  * hard_mish (experimental) added with a memory-efficient gradient, along with a memory-efficient hard_swish
  * context managers for setting exportable/scriptable/no_jit states
* Norm + Activation combo layers added with initial trial support in DenseNet and VoVNet, along with EvoNorm and an InplaceAbn wrapper that fit the interface
* TorchScript works for all but two of the model types when using PyTorch 1.5+; tests added for this
* Some import cleanup and classifier reset changes; all models will have their classifier reset to nn.Identity on a reset_classifier(0) call
* Prep for 0.1.28 pip release
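
For illustration, here is a minimal sketch of how the activation factory, the scriptable context manager, and the classifier reset fit together. The import paths, the `hard_mish` factory name, and the `ese_vovnet39b` feature width are assumptions based on the notes above, not code taken from this PR:

```python
import torch
import timm

# Assumption: the factory and context manager are imported from
# timm.models.layers in this release.
from timm.models.layers import get_act_layer, set_scriptable

# Activation factory: resolve an activation module class by name. Depending
# on the exportable/scriptable/no_jit state, the factory can hand back a
# memory-efficient or JIT-friendly variant of the same activation.
HardMish = get_act_layer('hard_mish')
act = HardMish()
print(act(torch.randn(4)))

# Layers built inside set_scriptable(True) pick torch.jit.script-compatible
# implementations, so the assembled model can be scripted (PyTorch 1.5+).
with set_scriptable(True):
    model = timm.create_model('ese_vovnet39b', pretrained=False)
    scripted = torch.jit.script(model)

# reset_classifier(0) swaps the classifier head for nn.Identity, so the
# forward pass returns pooled features instead of class logits.
model.reset_classifier(0)
features = model(torch.randn(1, 3, 224, 224))
print(features.shape)  # e.g. torch.Size([1, 1024]) for ese_vovnet39b
```
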
### May 12, 2020
* Add ResNeSt models (code adapted from https://github.com/zhanghang1989/ResNeSt, paper https://arxiv.org/abs/2004.08955)

@@ -14,7 +14,9 @@ tf_efficientnet_b8,29.3733,70.6267,57.0667,42.9333,87.41,672,0.954,bicubic
ig_resnext101_32x8d,28.7067,71.2933,52.32,47.68,88.79,224,0.875,bilinear
swsl_resnext101_32x16d,27.9467,72.0533,52.32,47.68,194.03,224,0.875,bilinear
tf_efficientnet_b7_ap,27.8133,72.1867,54.7733,45.2267,66.35,600,0.949,bicubic
resnest269e,27.6133,72.3867,53.1067,46.8933,110.93,416,0.875,bilinear
tresnet_xl_448,26.88,73.12,51.0933,48.9067,78.44,448,0.875,bilinear
resnest200e,26.4267,73.5733,51.9333,48.0667,70.2,320,0.875,bilinear
swsl_resnext101_32x4d,25.3467,74.6533,49.6267,50.3733,44.18,224,0.875,bilinear
tf_efficientnet_b7,25.2533,74.7467,51.6667,48.3333,66.35,600,0.949,bicubic
tresnet_l_448,24.5733,75.4267,48.6,51.4,55.99,448,0.875,bilinear
@@ -26,6 +28,7 @@ tf_efficientnet_b3_ns,19.4133,80.5867,44.6267,55.3733,12.23,300,0.904,bicubic
swsl_resnext50_32x4d,18.0667,81.9333,41.8667,58.1333,25.03,224,0.875,bilinear
ssl_resnext101_32x16d,17.2133,82.7867,39.9467,60.0533,194.03,224,0.875,bilinear
tf_efficientnet_b5,17.0667,82.9333,41.9067,58.0933,30.39,456,0.934,bicubic
resnest101e,16.4933,83.5067,40.7467,59.2533,48.28,256,0.875,bilinear
swsl_resnet50,15.9867,84.0133,38.8533,61.1467,25.56,224,0.875,bilinear
ssl_resnext101_32x8d,15.12,84.88,37.72,62.28,88.79,224,0.875,bilinear
tf_efficientnet_b4_ap,13.68,86.32,35.92,64.08,19.34,380,0.922,bicubic
@@ -36,14 +39,16 @@ nasnetalarge,12.5733,87.4267,33.4133,66.5867,88.75,331,0.875,bicubic
ssl_resnext101_32x4d,12.12,87.88,31.8933,68.1067,44.18,224,0.875,bilinear
tf_efficientnet_b2_ns,11.7867,88.2133,32.96,67.04,9.11,260,0.89,bicubic
gluon_senet154,9.9067,90.0933,26.4533,73.5467,115.09,224,0.875,bicubic
resnest50d_4s2x40d,9.7867,90.2133,29.1467,70.8533,30.42,224,0.875,bicubic
ssl_resnext50_32x4d,9.6667,90.3333,28.4267,71.5733,25.03,224,0.875,bilinear
senet154,9.4533,90.5467,26.44,73.56,115.09,224,0.875,bilinear
tresnet_xl,9.3067,90.6933,28.4133,71.5867,78.44,224,0.875,bilinear
efficientnet_b3a,9.2667,90.7333,28.4267,71.5733,12.23,320,1,bicubic
efficientnet_b3a,9.2667,90.7333,28.4267,71.5733,12.23,320,1.0,bicubic
efficientnet_b3,8.9467,91.0533,28.2133,71.7867,12.23,300,0.904,bicubic
inception_v4,8.92,91.08,24.7067,75.2933,42.68,299,0.875,bicubic
gluon_seresnext101_64x4d,8.8667,91.1333,27.32,72.68,88.23,224,0.875,bicubic
tf_efficientnet_b1_ns,8.6133,91.3867,27.28,72.72,7.79,240,0.882,bicubic
resnest50d_1s4x24d,8.52,91.48,26.7867,73.2133,25.68,224,0.875,bicubic
ecaresnet50d,8.5067,91.4933,26.2667,73.7333,25.58,224,0.875,bicubic
gluon_xception65,8.4667,91.5333,25.1333,74.8667,39.92,299,0.875,bicubic
gluon_resnet152_v1d,8.4133,91.5867,23.4533,76.5467,60.21,224,0.875,bicubic
@@ -55,23 +60,30 @@ ens_adv_inception_resnet_v2,7.9867,92.0133,23.8267,76.1733,55.84,299,0.8975,bicu
tf_efficientnet_lite4,7.9333,92.0667,25.56,74.44,13.01,380,0.92,bilinear
tresnet_l,7.88,92.12,25.1867,74.8133,55.99,224,0.875,bilinear
gluon_resnet152_v1s,7.8667,92.1333,23.1733,76.8267,60.32,224,0.875,bicubic
resnest50d,7.7467,92.2533,25.2933,74.7067,27.48,224,0.875,bilinear
gluon_resnext101_64x4d,7.7067,92.2933,23.24,76.76,83.46,224,0.875,bicubic
skresnext50_32x4d,7.08,92.92,23.0267,76.9733,27.48,224,0.875,bicubic
ssl_resnet50,7,93,23.92,76.08,25.56,224,0.875,bilinear
ssl_resnet50,7.0,93.0,23.92,76.08,25.56,224,0.875,bilinear
regnety_320,6.92,93.08,23.04,76.96,145.05,224,0.875,bicubic
ecaresnet101d_pruned,6.8,93.2,24.2,75.8,24.88,224,0.875,bicubic
ecaresnetlight,6.76,93.24,22.56,77.44,30.16,224,0.875,bicubic
efficientnet_b2a,6.76,93.24,23.4933,76.5067,9.11,288,1,bicubic
efficientnet_b2a,6.76,93.24,23.4933,76.5067,9.11,288,1.0,bicubic
seresnext101_32x4d,6.4133,93.5867,21.52,78.48,48.96,224,0.875,bilinear
efficientnet_b2,6.0933,93.9067,21.9333,78.0667,9.11,260,0.875,bicubic
gluon_resnext101_32x4d,6.04,93.96,21.1333,78.8667,44.18,224,0.875,bicubic
regnetx_320,5.9867,94.0133,19.88,80.12,107.81,224,0.875,bicubic
ese_vovnet39b,5.9733,94.0267,21.2933,78.7067,24.57,224,0.875,bicubic
gluon_resnet101_v1d,5.92,94.08,19.9467,80.0533,44.57,224,0.875,bicubic
gluon_seresnext50_32x4d,5.7867,94.2133,21.4267,78.5733,27.56,224,0.875,bicubic
efficientnet_b3_pruned,5.7333,94.2667,21.36,78.64,9.86,300,0.904,bicubic
regnety_160,5.64,94.36,19.3467,80.6533,83.59,224,0.875,bicubic
gluon_inception_v3,5.5067,94.4933,19.9467,80.0533,23.83,299,0.875,bicubic
mixnet_xl,5.48,94.52,21.0933,78.9067,11.9,224,0.875,bicubic
tresnet_m,5.44,94.56,19.96,80.04,31.39,224,0.875,bilinear
regnety_120,5.4133,94.5867,19.8533,80.1467,51.82,224,0.875,bicubic
gluon_resnet101_v1s,5.28,94.72,19.5467,80.4533,44.67,224,0.875,bicubic
hrnet_w64,5.1333,94.8667,19.4533,80.5467,128.06,224,0.875,bilinear
regnety_080,5.0,95.0,18.6,81.4,39.18,224,0.875,bicubic
efficientnet_b2_pruned,4.9467,95.0533,19.3467,80.6533,8.31,260,0.89,bicubic
dpn107,4.88,95.12,17.6133,82.3867,86.92,224,0.875,bicubic
gluon_resnet152_v1c,4.8667,95.1333,17.7733,82.2267,60.21,224,0.875,bicubic
@@ -84,38 +96,45 @@ gluon_resnet152_v1b,4.5867,95.4133,16.5333,83.4667,60.19,224,0.875,bicubic
ecaresnet50d_pruned,4.5467,95.4533,18.5467,81.4533,19.94,224,0.875,bicubic
dpn92,4.4933,95.5067,18.2,81.8,37.67,224,0.875,bicubic
hrnet_w44,4.4933,95.5067,17.3467,82.6533,67.06,224,0.875,bilinear
regnetx_160,4.3733,95.6267,17.0933,82.9067,54.28,224,0.875,bicubic
resnext50d_32x4d,4.3467,95.6533,17.7733,82.2267,25.05,224,0.875,bicubic
xception,4.3467,95.6533,16.76,83.24,22.86,299,0.8975,bicubic
seresnext50_32x4d,4.28,95.72,17.8133,82.1867,27.56,224,0.875,bilinear
resnext50_32x4d,4.2533,95.7467,18.3867,81.6133,25.03,224,0.875,bicubic
tf_efficientnet_cc_b1_8e,4.24,95.76,15.9467,84.0533,39.72,240,0.882,bicubic
regnety_064,4.2267,95.7733,17.1867,82.8133,30.58,224,0.875,bicubic
tf_efficientnet_el,4.2267,95.7733,18.1733,81.8267,10.59,300,0.904,bicubic
inception_v3,4.2,95.8,16.2933,83.7067,27.16,299,0.875,bicubic
inception_v3,4.1867,95.8133,16.2933,83.7067,23.83,299,0.875,bicubic
tf_efficientnet_b2_ap,4.1733,95.8267,18.32,81.68,9.11,260,0.89,bicubic
seresnet152,4.1467,95.8533,15.8933,84.1067,66.82,224,0.875,bilinear
resnext101_32x8d,4.1333,95.8667,16.9867,83.0133,88.79,224,0.875,bilinear
tf_efficientnet_b0_ns,4.1333,95.8667,17.68,82.32,5.29,224,0.875,bicubic
dpn98,4.08,95.92,15.9467,84.0533,61.57,224,0.875,bicubic
res2net101_26w_4s,4,96,14.8267,85.1733,45.21,224,0.875,bilinear
res2net101_26w_4s,4.0,96.0,14.8267,85.1733,45.21,224,0.875,bilinear
efficientnet_b1,3.9733,96.0267,15.76,84.24,7.79,240,0.875,bicubic
tf_efficientnet_lite3,3.9333,96.0667,16.52,83.48,8.2,300,0.904,bilinear
tf_efficientnet_b2,3.7733,96.2267,16.6133,83.3867,9.11,260,0.89,bicubic
regnety_040,3.7467,96.2533,16.4,83.6,20.65,224,0.875,bicubic
hrnet_w30,3.68,96.32,15.5733,84.4267,37.71,224,0.875,bilinear
hrnet_w32,3.6533,96.3467,14.7867,85.2133,41.23,224,0.875,bilinear
hrnet_w40,3.6533,96.3467,15.44,84.56,57.56,224,0.875,bilinear
regnetx_120,3.6267,96.3733,15.9733,84.0267,46.11,224,0.875,bicubic
seresnext26t_32x4d,3.6133,96.3867,15.8933,84.1067,16.82,224,0.875,bicubic
tf_efficientnet_b1_ap,3.5467,96.4533,15.0667,84.9333,7.79,240,0.882,bicubic
seresnext26tn_32x4d,3.5067,96.4933,15.76,84.24,16.81,224,0.875,bicubic
resnest26d,3.4933,96.5067,15.6667,84.3333,17.07,224,0.875,bilinear
dla169,3.4667,96.5333,15.3333,84.6667,53.99,224,0.875,bilinear
gluon_resnext50_32x4d,3.4533,96.5467,16.12,83.88,25.03,224,0.875,bicubic
mixnet_l,3.44,96.56,15.3067,84.6933,7.33,224,0.875,bicubic
seresnext26d_32x4d,3.4,96.6,16.16,83.84,16.81,224,0.875,bicubic
resnetblur50,3.3333,96.6667,15.5867,84.4133,25.56,224,0.875,bicubic
res2net50_26w_8s,3.3333,96.6667,14.04,85.96,48.4,224,0.875,bilinear
resnetblur50,3.3333,96.6667,15.5867,84.4133,25.56,224,0.875,bicubic
dla102x,3.3067,96.6933,15.12,84.88,26.77,224,0.875,bilinear
gluon_resnet101_v1c,3.3067,96.6933,14.12,85.88,44.57,224,0.875,bicubic
seresnet101,3.2533,96.7467,15.4533,84.5467,49.33,224,0.875,bilinear
densenetblur121d,3.0667,96.9333,14.28,85.72,8.0,224,0.875,bicubic
dla60_res2next,3.04,96.96,14.4533,85.5467,17.33,224,0.875,bilinear
regnety_032,3.0267,96.9733,14.24,85.76,19.44,224,0.875,bicubic
gluon_resnet50_v1d,3.0133,96.9867,14.6267,85.3733,25.58,224,0.875,bicubic
wide_resnet101_2,2.96,97.04,13.9467,86.0533,126.89,224,0.875,bilinear
efficientnet_b1_pruned,2.9333,97.0667,14.4133,85.5867,6.33,240,0.882,bicubic
@@ -124,6 +143,7 @@ tf_efficientnet_b1,2.8667,97.1333,13.5067,86.4933,7.79,240,0.882,bicubic
res2net50_26w_6s,2.84,97.16,12.6,87.4,37.05,224,0.875,bilinear
efficientnet_b0,2.8133,97.1867,13.9067,86.0933,5.29,224,0.875,bicubic
tf_mixnet_l,2.8133,97.1867,13.04,86.96,7.33,224,0.875,bicubic
regnetx_064,2.7867,97.2133,13.88,86.12,26.21,224,0.875,bicubic
dpn68b,2.7067,97.2933,12.64,87.36,12.61,224,0.875,bicubic
selecsls60b,2.6933,97.3067,13.1733,86.8267,32.77,224,0.875,bicubic
tf_efficientnet_cc_b0_8e,2.68,97.32,12.7733,87.2267,24.01,224,0.875,bicubic
@@ -134,24 +154,30 @@ mixnet_m,2.5467,97.4533,12.4267,87.5733,5.01,224,0.875,bicubic
skresnet34,2.52,97.48,12.7733,87.2267,22.28,224,0.875,bicubic
efficientnet_es,2.3733,97.6267,13.88,86.12,5.44,224,0.875,bicubic
resnet152,2.36,97.64,12.2,87.8,60.19,224,0.875,bilinear
regnetx_080,2.3467,97.6533,12.6933,87.3067,39.57,224,0.875,bicubic
swsl_resnet18,2.3333,97.6667,11.2133,88.7867,11.69,224,0.875,bilinear
wide_resnet50_2,2.32,97.68,11.8,88.2,68.88,224,0.875,bilinear
seresnext26_32x4d,2.2933,97.7067,12.44,87.56,16.79,224,0.875,bicubic
hrnet_w18,2.2667,97.7333,11.8533,88.1467,21.3,224,0.875,bilinear
dla102,2.2533,97.7467,12.12,87.88,33.73,224,0.875,bilinear
resnet50,2.2267,97.7733,11.3333,88.6667,25.56,224,0.875,bicubic
regnety_016,2.1733,97.8267,11.44,88.56,11.2,224,0.875,bicubic
regnetx_040,2.16,97.84,11.8,88.2,22.12,224,0.875,bicubic
resnest14d,2.1467,97.8533,10.4,89.6,10.61,224,0.875,bilinear
selecsls60,2.08,97.92,12.84,87.16,30.67,224,0.875,bicubic
tf_efficientnet_cc_b0_4e,2.08,97.92,10.9733,89.0267,13.31,224,0.875,bicubic
res2next50,2.0667,97.9333,11.4533,88.5467,24.67,224,0.875,bilinear
seresnet50,2.0667,97.9333,12.2667,87.7333,28.09,224,0.875,bilinear
densenet161,1.9733,98.0267,10.5867,89.4133,28.68,224,0.875,bicubic
tf_efficientnet_b0_ap,1.96,98.04,10.8,89.2,5.29,224,0.875,bicubic
regnetx_032,1.92,98.08,10.9467,89.0533,15.3,224,0.875,bicubic
tf_efficientnet_em,1.8133,98.1867,11.6267,88.3733,6.9,240,0.882,bicubic
tf_mixnet_m,1.8133,98.1867,10.5467,89.4533,5.01,224,0.875,bicubic
tf_efficientnet_lite2,1.8,98.2,11.1467,88.8533,6.09,260,0.89,bicubic
res2net50_14w_8s,1.7867,98.2133,10.3467,89.6533,25.06,224,0.875,bilinear
res2net50_26w_4s,1.7733,98.2267,10.44,89.56,25.7,224,0.875,bilinear
mobilenetv3_large_100,1.76,98.24,10.2933,89.7067,5.48,224,0.875,bicubic
densenet121,1.7333,98.2667,10.8533,89.1467,7.98,224,0.875,bicubic
tf_efficientnet_b0,1.6933,98.3067,9.7333,90.2667,5.29,224,0.875,bicubic
tv_resnext50_32x4d,1.68,98.32,10.6,89.4,25.03,224,0.875,bilinear
mobilenetv3_rw,1.6667,98.3333,10.7333,89.2667,5.48,224,0.875,bicubic
@@ -163,6 +189,7 @@ gluon_resnet50_v1c,1.5467,98.4533,10.6133,89.3867,25.58,224,0.875,bicubic
semnasnet_100,1.5467,98.4533,9.32,90.68,3.89,224,0.875,bicubic
selecsls42b,1.4667,98.5333,10.44,89.56,32.46,224,0.875,bicubic
tf_efficientnet_lite1,1.4533,98.5467,9.7067,90.2933,5.42,240,0.882,bicubic
regnety_008,1.4267,98.5733,8.9467,91.0533,6.26,224,0.875,bicubic
ssl_resnet18,1.3867,98.6133,8.16,91.84,11.69,224,0.875,bilinear
dla60,1.3467,98.6533,9.4667,90.5333,22.33,224,0.875,bilinear
dpn68,1.3467,98.6533,8.8133,91.1867,12.61,224,0.875,bicubic
@@ -178,19 +205,26 @@ seresnet34,1.12,98.88,7.4,92.6,21.96,224,0.875,bilinear
tf_efficientnet_es,1.12,98.88,8.6,91.4,5.44,224,0.875,bicubic
spnasnet_100,1.1067,98.8933,8.2533,91.7467,4.42,224,0.875,bilinear
tf_efficientnet_lite0,1.1067,98.8933,7.4933,92.5067,4.65,224,0.875,bicubic
regnetx_016,1.0933,98.9067,8.6267,91.3733,9.19,224,0.875,bicubic
dla34,1.08,98.92,7.6933,92.3067,15.78,224,0.875,bilinear
regnety_006,1.0533,98.9467,8.4,91.6,6.06,224,0.875,bicubic
regnety_004,1.0133,98.9867,7.3333,92.6667,4.34,224,0.875,bicubic
resnet34,0.9867,99.0133,7.5333,92.4667,21.8,224,0.875,bilinear
mobilenetv2_110d,0.9333,99.0667,8.1067,91.8933,4.52,224,0.875,bicubic
gluon_resnet34_v1b,0.8933,99.1067,6.6,93.4,21.8,224,0.875,bicubic
hrnet_w18_small_v2,0.8933,99.1067,7.3867,92.6133,15.6,224,0.875,bilinear
regnetx_008,0.8933,99.1067,6.9067,93.0933,7.26,224,0.875,bicubic
skresnet18,0.88,99.12,7.3867,92.6133,11.96,224,0.875,bicubic
mnasnet_100,0.8667,99.1333,7.8667,92.1333,4.38,224,0.875,bicubic
tf_mobilenetv3_large_075,0.8667,99.1333,6.72,93.28,3.99,224,0.875,bilinear
regnetx_006,0.76,99.24,6.4933,93.5067,6.2,224,0.875,bicubic
tf_mobilenetv3_small_100,0.7467,99.2533,4.6667,95.3333,2.54,224,0.875,bilinear
seresnet18,0.72,99.28,6.0267,93.9733,11.78,224,0.875,bicubic
densenet121,0.68,99.32,6.9067,93.0933,7.98,224,0.875,bicubic
regnetx_004,0.6933,99.3067,5.5067,94.4933,5.16,224,0.875,bicubic
tv_densenet121,0.68,99.32,6.9067,93.0933,7.98,224,0.875,bicubic
regnety_002,0.6667,99.3333,5.5333,94.4667,3.16,224,0.875,bicubic
tf_mobilenetv3_small_075,0.6267,99.3733,4.1733,95.8267,2.04,224,0.875,bilinear
resnet26,0.6,99.4,6.88,93.12,16,224,0.875,bicubic
resnet26,0.6,99.4,6.88,93.12,16.0,224,0.875,bicubic
tv_resnet34,0.6,99.4,5.52,94.48,21.8,224,0.875,bilinear
mobilenetv2_100,0.5333,99.4667,6.1867,93.8133,3.5,224,0.875,bicubic
dla46_c,0.52,99.48,4.1867,95.8133,1.31,224,0.875,bilinear
@@ -201,4 +235,5 @@ dla46x_c,0.4133,99.5867,4.44,95.56,1.08,224,0.875,bilinear
gluon_resnet18_v1b,0.3867,99.6133,4.7867,95.2133,11.69,224,0.875,bicubic
tf_mobilenetv3_small_minimal_100,0.36,99.64,2.8667,97.1333,2.04,224,0.875,bilinear
resnet18,0.2933,99.7067,4.04,95.96,11.69,224,0.875,bilinear
tv_resnet50,0,100,2.8933,97.1067,25.56,224,0.875,bilinear
regnetx_002,0.2267,99.7733,3.9867,96.0133,2.68,224,0.875,bicubic
tv_resnet50,0.0,100.0,2.8933,97.1067,25.56,224,0.875,bilinear

@@ -14,15 +14,18 @@ tf_efficientnet_b7,84.932,15.068,97.208,2.792,66.35,600,0.949,bicubic
tf_efficientnet_b6_ap,84.786,15.214,97.138,2.862,43.04,528,0.942,bicubic
swsl_resnext101_32x8d,84.294,15.706,97.174,2.826,88.79,224,0.875,bilinear
tf_efficientnet_b5_ap,84.254,15.746,96.976,3.024,30.39,456,0.934,bicubic
resnest269e,84.186,15.814,96.922,3.078,110.93,416,0.875,bilinear
ig_resnext101_32x16d,84.176,15.824,97.196,2.804,194.03,224,0.875,bilinear
tf_efficientnet_b6,84.112,15.888,96.884,3.116,43.04,528,0.942,bicubic
tf_efficientnet_b3_ns,84.054,15.946,96.912,3.088,12.23,300,0.904,bicubic
resnest200e,83.834,16.166,96.838,3.162,70.2,320,0.875,bilinear
tf_efficientnet_b5,83.816,16.184,96.75,3.25,30.39,456,0.934,bicubic
swsl_resnext101_32x16d,83.338,16.662,96.852,3.148,194.03,224,0.875,bilinear
tf_efficientnet_b4_ap,83.248,16.752,96.388,3.612,19.34,380,0.922,bicubic
swsl_resnext101_32x4d,83.234,16.766,96.756,3.244,44.18,224,0.875,bilinear
tresnet_xl_448,83.048,16.952,96.174,3.826,78.44,448,0.875,bilinear
tf_efficientnet_b4,83.016,16.984,96.298,3.702,19.34,380,0.922,bicubic
resnest101e,82.89,17.11,96.324,3.676,48.28,256,0.875,bilinear
pnasnet5large,82.74,17.26,96.04,3.96,86.06,331,0.875,bicubic
ig_resnext101_32x8d,82.688,17.312,96.632,3.368,88.79,224,0.875,bilinear
nasnetalarge,82.558,17.442,96.036,3.964,88.75,331,0.875,bicubic
@@ -31,7 +34,7 @@ tresnet_l_448,82.268,17.732,95.978,4.022,55.99,448,0.875,bilinear
swsl_resnext50_32x4d,82.18,17.82,96.228,3.772,25.03,224,0.875,bilinear
ecaresnet101d,82.166,17.834,96.052,3.948,44.57,224,0.875,bicubic
tresnet_xl,82.07,17.93,95.928,4.072,78.44,224,0.875,bilinear
efficientnet_b3a,81.874,18.126,95.84,4.16,12.23,320,1,bicubic
efficientnet_b3a,81.874,18.126,95.84,4.16,12.23,320,1.0,bicubic
ssl_resnext101_32x16d,81.836,18.164,96.094,3.906,194.03,224,0.875,bilinear
tf_efficientnet_b3_ap,81.828,18.172,95.624,4.376,12.23,300,0.904,bicubic
tresnet_m_448,81.712,18.288,95.57,4.43,31.39,448,0.875,bilinear
@@ -44,14 +47,18 @@ tf_efficientnet_b1_ns,81.386,18.614,95.738,4.262,7.79,240,0.882,bicubic
senet154,81.304,18.696,95.498,4.502,115.09,224,0.875,bilinear
gluon_senet154,81.224,18.776,95.356,4.644,115.09,224,0.875,bicubic
swsl_resnet50,81.18,18.82,95.986,4.014,25.56,224,0.875,bilinear
resnest50d_4s2x40d,81.114,18.886,95.568,4.432,30.42,224,0.875,bicubic
gluon_resnet152_v1s,81.012,18.988,95.416,4.584,60.32,224,0.875,bicubic
resnest50d_1s4x24d,80.99,19.01,95.322,4.678,25.68,224,0.875,bicubic
resnest50d,80.958,19.042,95.382,4.618,27.48,224,0.875,bilinear
ssl_resnext101_32x4d,80.928,19.072,95.728,4.272,44.18,224,0.875,bilinear
gluon_seresnext101_32x4d,80.902,19.098,95.294,4.706,48.96,224,0.875,bicubic
gluon_seresnext101_64x4d,80.89,19.11,95.304,4.696,88.23,224,0.875,bicubic
efficientnet_b3_pruned,80.856,19.144,95.24,4.76,9.86,300,0.904,bicubic
regnety_320,80.814,19.186,95.24,4.76,145.05,224,0.875,bicubic
ecaresnet101d_pruned,80.808,19.192,95.628,4.372,24.88,224,0.875,bicubic
tresnet_m,80.796,19.204,94.856,5.144,31.39,224,0.875,bilinear
efficientnet_b2a,80.608,19.392,95.31,4.69,9.11,288,1,bicubic
efficientnet_b2a,80.608,19.392,95.31,4.69,9.11,288,1.0,bicubic
ecaresnet50d,80.604,19.396,95.322,4.678,25.58,224,0.875,bicubic
gluon_resnext101_64x4d,80.602,19.398,94.994,5.006,83.46,224,0.875,bicubic
mixnet_xl,80.478,19.522,94.932,5.068,11.9,224,0.875,bicubic
@@ -61,10 +68,13 @@ ecaresnetlight,80.454,19.546,95.256,4.744,30.16,224,0.875,bicubic
tf_efficientnet_el,80.448,19.552,95.16,4.84,10.59,300,0.904,bicubic
gluon_resnet101_v1d,80.424,19.576,95.02,4.98,44.57,224,0.875,bicubic
efficientnet_b2,80.402,19.598,95.076,4.924,9.11,260,0.875,bicubic
regnety_120,80.382,19.618,95.128,4.872,51.82,224,0.875,bicubic
gluon_resnext101_32x4d,80.334,19.666,94.926,5.074,44.18,224,0.875,bicubic
ssl_resnext50_32x4d,80.328,19.672,95.404,4.596,25.03,224,0.875,bilinear
tf_efficientnet_b2_ap,80.306,19.694,95.028,4.972,9.11,260,0.89,bicubic
gluon_resnet101_v1s,80.3,19.7,95.15,4.85,44.67,224,0.875,bicubic
regnety_160,80.3,19.7,94.962,5.038,83.59,224,0.875,bicubic
regnetx_320,80.246,19.754,95.022,4.978,107.81,224,0.875,bicubic
seresnext101_32x4d,80.236,19.764,95.028,4.972,48.96,224,0.875,bilinear
dpn107,80.164,19.836,94.912,5.088,86.92,224,0.875,bicubic
inception_v4,80.156,19.844,94.974,5.026,42.68,299,0.875,bicubic
@@ -75,18 +85,23 @@ ens_adv_inception_resnet_v2,79.976,20.024,94.946,5.054,55.84,299,0.8975,bicubic
efficientnet_b2_pruned,79.918,20.082,94.848,5.152,8.31,260,0.89,bicubic
gluon_resnet152_v1c,79.916,20.084,94.842,5.158,60.21,224,0.875,bicubic
gluon_seresnext50_32x4d,79.912,20.088,94.818,5.182,27.56,224,0.875,bicubic
regnety_080,79.868,20.132,94.832,5.168,39.18,224,0.875,bicubic
regnetx_160,79.866,20.134,94.828,5.172,54.28,224,0.875,bicubic
dpn131,79.828,20.172,94.704,5.296,79.25,224,0.875,bicubic
tf_efficientnet_lite3,79.812,20.188,94.914,5.086,8.2,300,0.904,bilinear
resnext50_32x4d,79.762,20.238,94.6,5.4,25.03,224,0.875,bicubic
ecaresnet50d_pruned,79.718,20.282,94.89,5.11,19.94,224,0.875,bicubic
regnety_064,79.712,20.288,94.774,5.226,30.58,224,0.875,bicubic
gluon_resnet152_v1b,79.692,20.308,94.738,5.262,60.19,224,0.875,bicubic
resnext50d_32x4d,79.674,20.326,94.868,5.132,25.05,224,0.875,bicubic
dpn98,79.636,20.364,94.594,5.406,61.57,224,0.875,bicubic
gluon_xception65,79.604,20.396,94.748,5.252,39.92,299,0.875,bicubic
regnetx_120,79.59,20.41,94.74,5.26,46.11,224,0.875,bicubic
gluon_resnet101_v1c,79.544,20.456,94.586,5.414,44.57,224,0.875,bicubic
hrnet_w64,79.472,20.528,94.65,5.35,128.06,224,0.875,bilinear
dla102x2,79.452,20.548,94.644,5.356,41.75,224,0.875,bilinear
gluon_resnext50_32x4d,79.356,20.644,94.424,5.576,25.03,224,0.875,bicubic
ese_vovnet39b,79.32,20.68,94.71,5.29,24.57,224,0.875,bicubic
resnext101_32x8d,79.312,20.688,94.526,5.474,88.79,224,0.875,bilinear
hrnet_w48,79.31,20.69,94.518,5.482,77.47,224,0.875,bilinear
gluon_resnet101_v1b,79.304,20.696,94.524,5.476,44.55,224,0.875,bicubic
@@ -94,15 +109,19 @@ tf_efficientnet_cc_b1_8e,79.298,20.702,94.364,5.636,39.72,240,0.882,bicubic
resnetblur50,79.29,20.71,94.632,5.368,25.56,224,0.875,bicubic
tf_efficientnet_b1_ap,79.278,20.722,94.308,5.692,7.79,240,0.882,bicubic
ssl_resnet50,79.228,20.772,94.832,5.168,25.56,224,0.875,bilinear
regnety_040,79.222,20.778,94.656,5.344,20.65,224,0.875,bicubic
res2net50_26w_8s,79.21,20.79,94.362,5.638,48.4,224,0.875,bilinear
regnetx_080,79.198,20.802,94.558,5.442,39.57,224,0.875,bicubic
res2net101_26w_4s,79.196,20.804,94.44,5.56,45.21,224,0.875,bilinear
seresnext50_32x4d,79.076,20.924,94.434,5.566,27.56,224,0.875,bilinear
gluon_resnet50_v1d,79.074,20.926,94.476,5.524,25.58,224,0.875,bicubic
regnetx_064,79.066,20.934,94.456,5.544,26.21,224,0.875,bicubic
xception,79.048,20.952,94.392,5.608,22.86,299,0.8975,bicubic
resnet50,79.032,20.968,94.384,5.616,25.56,224,0.875,bicubic
mixnet_l,78.976,21.024,94.184,5.816,7.33,224,0.875,bicubic
hrnet_w40,78.934,21.066,94.466,5.534,57.56,224,0.875,bilinear
hrnet_w44,78.894,21.106,94.37,5.63,67.06,224,0.875,bilinear
regnety_032,78.87,21.13,94.402,5.598,19.44,224,0.875,bicubic
wide_resnet101_2,78.846,21.154,94.284,5.716,126.89,224,0.875,bilinear
tf_efficientnet_b1,78.832,21.168,94.196,5.804,7.79,240,0.882,bicubic
gluon_inception_v3,78.804,21.196,94.38,5.62,23.83,299,0.875,bicubic
@@ -115,6 +134,8 @@ seresnet152,78.658,21.342,94.374,5.626,66.82,224,0.875,bilinear
tf_efficientnet_b0_ns,78.652,21.348,94.368,5.632,5.29,224,0.875,bicubic
res2net50_26w_6s,78.574,21.426,94.126,5.874,37.05,224,0.875,bilinear
dla102x,78.508,21.492,94.234,5.766,26.77,224,0.875,bilinear
regnetx_040,78.486,21.514,94.242,5.758,22.12,224,0.875,bicubic
resnest26d,78.482,21.518,94.29,5.71,17.07,224,0.875,bilinear
dla60_res2net,78.472,21.528,94.204,5.796,21.15,224,0.875,bilinear
wide_resnet50_2,78.468,21.532,94.086,5.914,68.88,224,0.875,bilinear
dla60_res2next,78.448,21.552,94.144,5.856,17.33,224,0.875,bilinear
@@ -122,10 +143,11 @@ hrnet_w32,78.448,21.552,94.188,5.812,41.23,224,0.875,bilinear
selecsls60b,78.418,21.582,94.166,5.834,32.77,224,0.875,bicubic
seresnet101,78.396,21.604,94.258,5.742,49.33,224,0.875,bilinear
resnet152,78.312,21.688,94.046,5.954,60.19,224,0.875,bilinear
efficientnet_b1_pruned,78.242,21.758,93.832,6.168,6.33,240,0.882,bicubic
dla60x,78.242,21.758,94.022,5.978,17.65,224,0.875,bilinear
efficientnet_b1_pruned,78.242,21.758,93.832,6.168,6.33,240,0.882,bicubic
res2next50,78.242,21.758,93.892,6.108,24.67,224,0.875,bilinear
hrnet_w30,78.196,21.804,94.218,5.782,37.71,224,0.875,bilinear
hrnet_w30,78.196,21.804,94.22,5.78,37.71,224,0.875,bilinear
regnetx_032,78.166,21.834,94.08,5.92,15.3,224,0.875,bicubic
res2net50_14w_8s,78.152,21.848,93.842,6.158,25.06,224,0.875,bilinear
efficientnet_es,78.054,21.946,93.93,6.07,5.44,224,0.875,bicubic
dla102,78.026,21.974,93.95,6.05,33.73,224,0.875,bilinear
@@ -136,16 +158,17 @@ selecsls60,77.982,22.018,93.832,6.168,30.67,224,0.875,bicubic
res2net50_26w_4s,77.946,22.054,93.852,6.148,25.7,224,0.875,bilinear
tf_efficientnet_cc_b0_8e,77.908,22.092,93.656,6.344,24.01,224,0.875,bicubic
tf_inception_v3,77.856,22.144,93.644,6.356,23.83,299,0.875,bicubic
regnety_016,77.852,22.148,93.716,6.284,11.2,224,0.875,bicubic
efficientnet_b0,77.692,22.308,93.532,6.468,5.29,224,0.875,bicubic
seresnet50,77.636,22.364,93.752,6.248,28.09,224,0.875,bilinear
tv_resnext50_32x4d,77.618,22.382,93.698,6.302,25.03,224,0.875,bilinear
seresnext26d_32x4d,77.604,22.396,93.612,6.388,16.81,224,0.875,bicubic
adv_inception_v3,77.58,22.42,93.724,6.276,23.83,299,0.875,bicubic
gluon_resnet50_v1b,77.578,22.422,93.718,6.282,25.56,224,0.875,bicubic
adv_inception_v3,77.576,22.424,93.724,6.276,23.83,299,0.875,bicubic
dpn68b,77.514,22.486,93.822,6.178,12.61,224,0.875,bicubic
res2net50_48w_2s,77.514,22.486,93.548,6.452,25.29,224,0.875,bilinear
tf_efficientnet_lite2,77.46,22.54,93.746,6.254,6.09,260,0.89,bicubic
inception_v3,77.434,22.566,93.478,6.522,27.16,299,0.875,bicubic
inception_v3,77.436,22.564,93.476,6.524,23.83,299,0.875,bicubic
resnet101,77.374,22.626,93.546,6.454,44.55,224,0.875,bilinear
densenet161,77.348,22.652,93.648,6.352,28.68,224,0.875,bicubic
tf_efficientnet_cc_b0_4e,77.304,22.696,93.332,6.668,13.31,224,0.875,bicubic
@@ -156,46 +179,58 @@ mixnet_m,77.256,22.744,93.418,6.582,5.01,224,0.875,bicubic
selecsls42b,77.176,22.824,93.392,6.608,32.46,224,0.875,bicubic
seresnext26_32x4d,77.1,22.9,93.31,6.69,16.79,224,0.875,bicubic
tf_efficientnet_b0_ap,77.084,22.916,93.254,6.746,5.29,224,0.875,bicubic
dla60,77.022,22.978,93.308,6.692,22.33,224,0.875,bilinear
dla60,77.024,22.976,93.308,6.692,22.33,224,0.875,bilinear
tf_mixnet_m,76.95,23.05,93.156,6.844,5.01,224,0.875,bicubic
regnetx_016,76.93,23.07,93.418,6.582,9.19,224,0.875,bicubic
skresnet34,76.91,23.09,93.316,6.684,22.28,224,0.875,bicubic
tf_efficientnet_b0,76.84,23.16,93.226,6.774,5.29,224,0.875,bicubic
hrnet_w18,76.756,23.244,93.442,6.558,21.3,224,0.875,bilinear
resnet26d,76.68,23.32,93.166,6.834,16.01,224,0.875,bicubic
tf_efficientnet_lite1,76.638,23.362,93.232,6.768,5.42,240,0.882,bicubic
densenetblur121d,76.576,23.424,93.19,6.81,8.0,224,0.875,bicubic
mobilenetv2_140,76.524,23.476,92.99,7.01,6.11,224,0.875,bicubic
regnety_008,76.314,23.686,93.062,6.938,6.26,224,0.875,bicubic
dpn68,76.306,23.694,92.97,7.03,12.61,224,0.875,bicubic
tv_resnet50,76.13,23.87,92.862,7.138,25.56,224,0.875,bilinear
mixnet_s,75.988,24.012,92.794,7.206,4.13,224,0.875,bicubic
densenet169,75.912,24.088,93.024,6.976,14.15,224,0.875,bicubic
mobilenetv3_large_100,75.768,24.232,92.54,7.46,5.48,224,0.875,bicubic
tf_mixnet_s,75.648,24.352,92.636,7.364,4.13,224,0.875,bicubic
mobilenetv3_rw,75.628,24.372,92.708,7.292,5.48,224,0.875,bicubic
mobilenetv3_rw,75.628,24.372,92.71,7.29,5.48,224,0.875,bicubic
densenet121,75.574,24.426,92.656,7.344,7.98,224,0.875,bicubic
tf_mobilenetv3_large_100,75.516,24.484,92.6,7.4,5.48,224,0.875,bilinear
resnest14d,75.504,24.496,92.514,7.486,10.61,224,0.875,bilinear
semnasnet_100,75.456,24.544,92.592,7.408,3.89,224,0.875,bicubic
resnet26,75.292,24.708,92.57,7.43,16,224,0.875,bicubic
resnet26,75.292,24.708,92.57,7.43,16.0,224,0.875,bicubic
regnety_006,75.26,24.74,92.528,7.472,6.06,224,0.875,bicubic
hrnet_w18_small_v2,75.126,24.874,92.416,7.584,15.6,224,0.875,bilinear
fbnetc_100,75.12,24.88,92.386,7.614,5.57,224,0.875,bilinear
resnet34,75.112,24.888,92.288,7.712,21.8,224,0.875,bilinear
mobilenetv2_110d,75.052,24.948,92.18,7.82,4.52,224,0.875,bicubic
regnetx_008,75.022,24.978,92.344,7.656,7.26,224,0.875,bicubic
tf_efficientnet_lite0,74.842,25.158,92.17,7.83,4.65,224,0.875,bicubic
seresnet34,74.808,25.192,92.126,7.874,21.96,224,0.875,bilinear
densenet121,74.752,25.248,92.152,7.848,7.98,224,0.875,bicubic
tv_densenet121,74.752,25.248,92.152,7.848,7.98,224,0.875,bicubic
mnasnet_100,74.656,25.344,92.126,7.874,4.38,224,0.875,bicubic
dla34,74.636,25.364,92.064,7.936,15.78,224,0.875,bilinear
gluon_resnet34_v1b,74.58,25.42,91.988,8.012,21.8,224,0.875,bicubic
spnasnet_100,74.08,25.92,91.832,8.168,4.42,224,0.875,bilinear
regnety_004,74.026,25.974,91.748,8.252,4.34,224,0.875,bicubic
regnetx_006,73.862,26.138,91.68,8.32,6.2,224,0.875,bicubic
tf_mobilenetv3_large_075,73.442,26.558,91.352,8.648,3.99,224,0.875,bilinear
tv_resnet34,73.314,26.686,91.42,8.58,21.8,224,0.875,bilinear
swsl_resnet18,73.286,26.714,91.732,8.268,11.69,224,0.875,bilinear
skresnet18,73.044,26.956,91.178,8.822,11.96,224,0.875,bicubic
mobilenetv2_100,72.978,27.022,91.016,8.984,3.5,224,0.875,bicubic
ssl_resnet18,72.6,27.4,91.418,8.582,11.69,224,0.875,bilinear
ssl_resnet18,72.6,27.4,91.416,8.584,11.69,224,0.875,bilinear
regnetx_004,72.406,27.594,90.83,9.17,5.16,224,0.875,bicubic
hrnet_w18_small,72.342,27.658,90.672,9.328,13.19,224,0.875,bilinear
tf_mobilenetv3_large_minimal_100,72.244,27.756,90.636,9.364,3.92,224,0.875,bilinear
seresnet18,71.758,28.242,90.334,9.666,11.78,224,0.875,bicubic
gluon_resnet18_v1b,70.83,29.17,89.756,10.244,11.69,224,0.875,bicubic
regnety_002,70.282,29.718,89.54,10.46,3.16,224,0.875,bicubic
resnet18,69.758,30.242,89.078,10.922,11.69,224,0.875,bilinear
regnetx_002,68.754,31.246,88.548,11.452,2.68,224,0.875,bicubic
tf_mobilenetv3_small_100,67.918,32.082,87.662,12.338,2.54,224,0.875,bilinear
dla60x_c,67.908,32.092,88.434,11.566,1.34,224,0.875,bilinear
dla46x_c,65.98,34.02,86.98,13.02,1.08,224,0.875,bilinear

@@ -17,12 +17,15 @@ tf_efficientnet_b7,74.72,25.28,92.22,7.78,66.35,600,0.949,bicubic
tf_efficientnet_b5_ap,74.59,25.41,91.99,8.01,30.39,456,0.934,bicubic
swsl_resnext101_32x4d,74.15,25.85,91.99,8.01,44.18,224,0.875,bilinear
swsl_resnext101_32x16d,74.01,25.99,92.17,7.83,194.03,224,0.875,bilinear
resnest200e,73.93,26.07,91.58,8.42,70.2,320,0.875,bilinear
tf_efficientnet_b6,73.9,26.1,91.75,8.25,43.04,528,0.942,bicubic
tf_efficientnet_b3_ns,73.87,26.13,91.86,8.14,12.23,300,0.904,bicubic
ig_resnext101_32x8d,73.66,26.34,92.15,7.85,88.79,224,0.875,bilinear
tf_efficientnet_b5,73.54,26.46,91.46,8.54,30.39,456,0.934,bicubic
resnest269e,73.46,26.54,91.68,8.32,110.93,416,0.875,bilinear
tf_efficientnet_b4_ap,72.89,27.11,90.98,9.02,19.34,380,0.922,bicubic
swsl_resnext50_32x4d,72.58,27.42,90.84,9.16,25.03,224,0.875,bilinear
resnest101e,72.55,27.45,90.81,9.19,48.28,256,0.875,bilinear
tresnet_xl_448,72.55,27.45,90.31,9.69,78.44,448,0.875,bilinear
pnasnet5large,72.37,27.63,90.26,9.74,86.06,331,0.875,bicubic
nasnetalarge,72.31,27.69,90.51,9.49,88.75,331,0.875,bicubic
@ -34,9 +37,10 @@ tresnet_l_448,71.6,28.4,90.06,9.94,55.99,448,0.875,bilinear
ecaresnet101d,71.5,28.5,90.31,9.69,44.57,224,0.875,bicubic
ssl_resnext101_32x8d,71.49,28.51,90.47,9.53,88.79,224,0.875,bilinear
ssl_resnext101_32x16d,71.4,28.6,90.55,9.45,194.03,224,0.875,bilinear
tresnet_m_448,71,29,88.68,11.32,31.39,448,0.875,bilinear
tresnet_m_448,71.0,29.0,88.68,11.32,31.39,448,0.875,bilinear
resnest50d_4s2x40d,70.94,29.06,89.71,10.29,30.42,224,0.875,bicubic
tf_efficientnet_b3_ap,70.92,29.08,89.43,10.57,12.23,300,0.904,bicubic
efficientnet_b3a,70.87,29.13,89.72,10.28,12.23,320,1,bicubic
efficientnet_b3a,70.87,29.13,89.72,10.28,12.23,320,1.0,bicubic
tf_efficientnet_b1_ns,70.85,29.15,90.11,9.89,7.79,240,0.882,bicubic
tresnet_l,70.83,29.17,89.61,10.39,55.99,224,0.875,bilinear
efficientnet_b3,70.76,29.24,89.84,10.16,12.23,300,0.904,bicubic
@ -45,7 +49,9 @@ gluon_senet154,70.6,29.4,88.92,11.08,115.09,224,0.875,bicubic
ssl_resnext101_32x4d,70.5,29.5,89.76,10.24,44.18,224,0.875,bilinear
senet154,70.48,29.52,88.99,11.01,115.09,224,0.875,bilinear
gluon_seresnext101_64x4d,70.44,29.56,89.35,10.65,88.23,224,0.875,bicubic
resnest50d_1s4x24d,70.43,29.57,89.24,10.76,25.68,224,0.875,bicubic
tf_efficientnet_lite4,70.43,29.57,89.12,10.88,13.01,380,0.92,bilinear
resnest50d,70.42,29.58,88.76,11.24,27.48,224,0.875,bilinear
gluon_resnet152_v1s,70.32,29.68,88.87,11.13,60.32,224,0.875,bicubic
ecaresnet101d_pruned,70.12,29.88,89.58,10.42,24.88,224,0.875,bicubic
inception_resnet_v2,70.12,29.88,88.68,11.32,55.84,299,0.8975,bicubic
@ -54,16 +60,16 @@ gluon_resnet152_v1d,69.95,30.05,88.47,11.53,60.21,224,0.875,bicubic
ecaresnet50d,69.83,30.17,89.37,10.63,25.58,224,0.875,bicubic
gluon_resnext101_64x4d,69.69,30.31,88.26,11.74,83.46,224,0.875,bicubic
ssl_resnext50_32x4d,69.69,30.31,89.42,10.58,25.03,224,0.875,bilinear
tresnet_m,69.65,30.35,88,12,31.39,224,0.875,bilinear
tresnet_m,69.65,30.35,88.0,12.0,31.39,224,0.875,bilinear
efficientnet_b3_pruned,69.58,30.42,88.97,11.03,9.86,300,0.904,bicubic
ens_adv_inception_resnet_v2,69.52,30.48,88.5,11.5,55.84,299,0.8975,bicubic
efficientnet_b2a,69.49,30.51,88.68,11.32,9.11,288,1,bicubic
efficientnet_b2a,69.49,30.51,88.68,11.32,9.11,288,1.0,bicubic
inception_v4,69.35,30.65,88.78,11.22,42.68,299,0.875,bicubic
seresnext101_32x4d,69.34,30.66,88.05,11.95,48.96,224,0.875,bilinear
ecaresnetlight,69.33,30.67,89.22,10.78,30.16,224,0.875,bicubic
gluon_resnet152_v1c,69.13,30.87,87.89,12.11,60.21,224,0.875,bicubic
mixnet_xl,69.08,30.92,88.31,11.69,11.9,224,0.875,bicubic
efficientnet_b2,69,31,88.62,11.38,9.11,260,0.875,bicubic
efficientnet_b2,69.0,31.0,88.62,11.38,9.11,260,0.875,bicubic
gluon_resnet101_v1d,68.99,31.01,88.08,11.92,44.57,224,0.875,bicubic
gluon_xception65,68.98,31.02,88.32,11.68,39.92,299,0.875,bicubic
gluon_resnext101_32x4d,68.96,31.04,88.34,11.66,44.18,224,0.875,bicubic
@ -85,6 +91,7 @@ dla102x2,68.34,31.66,87.87,12.13,41.75,224,0.875,bilinear
efficientnet_b2_pruned,68.3,31.7,88.1,11.9,8.31,260,0.89,bicubic
gluon_resnext50_32x4d,68.28,31.72,87.32,12.68,25.03,224,0.875,bicubic
tf_efficientnet_lite3,68.23,31.77,87.72,12.28,8.2,300,0.904,bilinear
ese_vovnet39b,68.19,31.81,88.26,11.74,24.57,224,0.875,bicubic
tf_efficientnet_el,68.18,31.82,88.35,11.65,10.59,300,0.904,bicubic
dpn92,68.01,31.99,87.59,12.41,37.67,224,0.875,bicubic
gluon_resnet50_v1d,67.91,32.09,87.12,12.88,25.58,224,0.875,bicubic
@ -104,15 +111,16 @@ tf_efficientnet_b1_ap,67.52,32.48,87.77,12.23,7.79,240,0.882,bicubic
tf_efficientnet_cc_b1_8e,67.48,32.52,87.31,12.69,39.72,240,0.882,bicubic
gluon_resnet101_v1b,67.45,32.55,87.23,12.77,44.55,224,0.875,bicubic
res2net101_26w_4s,67.45,32.55,87.01,12.99,45.21,224,0.875,bilinear
resnetblur50,67.44,32.56,87.43,12.57,25.56,224,0.875,bicubic
resnet50,67.44,32.56,87.42,12.58,25.56,224,0.875,bicubic
resnetblur50,67.44,32.56,87.43,12.57,25.56,224,0.875,bicubic
resnest26d,67.21,32.79,87.18,12.82,17.07,224,0.875,bilinear
efficientnet_b1,67.16,32.84,87.15,12.85,7.79,240,0.875,bicubic
seresnet101,67.15,32.85,87.05,12.95,49.33,224,0.875,bilinear
gluon_resnet50_v1s,67.1,32.9,86.86,13.14,25.68,224,0.875,bicubic
dla60x,67.08,32.92,87.17,12.83,17.65,224,0.875,bilinear
dla60_res2net,67.03,32.97,87.14,12.86,21.15,224,0.875,bilinear
resnet152,67.02,32.98,87.57,12.43,60.19,224,0.875,bilinear
dla102x,67,33,86.77,13.23,26.77,224,0.875,bilinear
dla102x,67.0,33.0,86.77,13.23,26.77,224,0.875,bilinear
mixnet_l,66.97,33.03,86.94,13.06,7.33,224,0.875,bicubic
res2net50_26w_6s,66.91,33.09,86.9,13.1,37.05,224,0.875,bilinear
efficientnet_es,66.89,33.11,86.73,13.27,5.44,224,0.875,bicubic
@ -128,14 +136,14 @@ dla60_res2next,66.64,33.36,87.02,12.98,17.33,224,0.875,bilinear
adv_inception_v3,66.6,33.4,86.56,13.44,23.83,299,0.875,bicubic
dla102,66.55,33.45,86.91,13.09,33.73,224,0.875,bilinear
gluon_resnet50_v1c,66.54,33.46,86.16,13.84,25.58,224,0.875,bicubic
tf_inception_v3,66.41,33.59,86.68,13.32,23.83,299,0.875,bicubic
tf_inception_v3,66.42,33.58,86.68,13.32,23.83,299,0.875,bicubic
efficientnet_b0,66.25,33.75,85.95,14.05,5.29,224,0.875,bicubic
seresnet50,66.24,33.76,86.33,13.67,28.09,224,0.875,bilinear
selecsls60,66.22,33.78,86.33,13.67,30.67,224,0.875,bicubic
tf_efficientnet_cc_b0_8e,66.21,33.79,86.22,13.78,24.01,224,0.875,bicubic
tv_resnext50_32x4d,66.18,33.82,86.04,13.96,25.03,224,0.875,bilinear
res2net50_26w_4s,66.17,33.83,86.6,13.4,25.7,224,0.875,bilinear
inception_v3,66.12,33.88,86.34,13.66,27.16,299,0.875,bicubic
inception_v3,66.12,33.88,86.34,13.66,23.83,299,0.875,bicubic
efficientnet_b1_pruned,66.08,33.92,86.58,13.42,6.33,240,0.882,bicubic
gluon_resnet50_v1b,66.04,33.96,86.27,13.73,25.56,224,0.875,bicubic
res2net50_14w_8s,66.02,33.98,86.24,13.76,25.06,224,0.875,bilinear
@ -151,6 +159,7 @@ tf_efficientnet_b0_ap,65.49,34.51,85.55,14.45,5.29,224,0.875,bicubic
seresnext26d_32x4d,65.42,34.58,85.97,14.03,16.81,224,0.875,bicubic
tf_efficientnet_lite2,65.39,34.61,86.03,13.97,6.09,260,0.89,bicubic
res2net50_48w_2s,65.32,34.68,85.96,14.04,25.29,224,0.875,bilinear
densenetblur121d,65.3,34.7,85.71,14.29,8.0,224,0.875,bicubic
densenet201,65.28,34.72,85.67,14.33,20.01,224,0.875,bicubic
tf_efficientnet_es,65.24,34.76,85.54,14.46,5.44,224,0.875,bicubic
dla60,65.22,34.78,85.75,14.25,22.33,224,0.875,bilinear
@ -166,21 +175,23 @@ tf_efficientnet_b0,64.29,35.71,85.25,14.75,5.29,224,0.875,bicubic
tf_mixnet_m,64.27,35.73,85.09,14.91,5.01,224,0.875,bicubic
dpn68,64.22,35.78,85.18,14.82,12.61,224,0.875,bicubic
mobilenetv2_140,64.05,35.95,85.02,14.98,6.11,224,0.875,bicubic
densenet121,63.74,36.26,84.63,15.37,7.98,224,0.875,bicubic
resnest14d,63.6,36.4,84.22,15.78,10.61,224,0.875,bilinear
tf_mixnet_s,63.59,36.41,84.27,15.73,4.13,224,0.875,bicubic
resnet26,63.45,36.55,84.27,15.73,16,224,0.875,bicubic
resnet26,63.45,36.55,84.27,15.73,16.0,224,0.875,bicubic
mixnet_s,63.38,36.62,84.71,15.29,4.13,224,0.875,bicubic
mobilenetv3_large_100,63.36,36.64,84.08,15.92,5.48,224,0.875,bicubic
tv_resnet50,63.33,36.67,84.65,15.35,25.56,224,0.875,bilinear
mobilenetv3_rw,63.23,36.77,84.52,15.48,5.48,224,0.875,bicubic
semnasnet_100,63.12,36.88,84.53,15.47,3.89,224,0.875,bicubic
densenet121,62.94,37.06,84.26,15.74,7.98,224,0.875,bicubic
tv_densenet121,62.94,37.06,84.26,15.74,7.98,224,0.875,bicubic
seresnet34,62.89,37.11,84.22,15.78,21.96,224,0.875,bilinear
hrnet_w18_small_v2,62.83,37.17,83.97,16.03,15.6,224,0.875,bilinear
mobilenetv2_110d,62.82,37.18,84.48,15.52,4.52,224,0.875,bicubic
resnet34,62.82,37.18,84.12,15.88,21.8,224,0.875,bilinear
swsl_resnet18,62.73,37.27,84.3,15.7,11.69,224,0.875,bilinear
tf_efficientnet_lite0,62.58,37.42,84.25,15.75,4.65,224,0.875,bicubic
gluon_resnet34_v1b,62.56,37.44,84,16,21.8,224,0.875,bicubic
gluon_resnet34_v1b,62.56,37.44,84.0,16.0,21.8,224,0.875,bicubic
dla34,62.51,37.49,83.92,16.08,15.78,224,0.875,bilinear
tf_mobilenetv3_large_100,62.47,37.53,83.96,16.04,5.48,224,0.875,bilinear
fbnetc_100,62.43,37.57,83.39,16.61,5.57,224,0.875,bilinear

@ -24,16 +24,22 @@ tf_efficientnet_b4_ap,40.4763,59.5237,61.7127,38.2873,19.34,380,0.922,bicubic
tf_efficientnet_b3_ns,39.5822,60.4178,61.4632,38.5368,12.23,300,0.904,bicubic
tf_efficientnet_b5,38.3285,61.6715,59.9285,40.0715,30.39,456,0.934,bicubic
tf_efficientnet_b3_ap,37.0611,62.9389,57.2363,42.7637,12.23,300,0.904,bicubic
resnest269e,36.67,63.33,56.8099,43.1901,110.93,416,0.875,bilinear
tf_efficientnet_b2_ns,36.1768,63.8232,57.5547,42.4453,9.11,260,0.89,bicubic
ecaresnet101d,36.0058,63.9942,56.1536,43.8464,44.57,224,0.875,bicubic
swsl_resnet18,35.8604,64.1396,58.439,41.561,11.69,224,0.875,bilinear
swsl_resnet18,35.8604,64.1396,58.437,41.563,11.69,224,0.875,bilinear
resnest200e,35.8466,64.1534,55.8903,44.1097,70.2,320,0.875,bilinear
resnest101e,35.3652,64.6348,55.7861,44.2139,48.28,256,0.875,bilinear
ssl_resnext101_32x16d,34.6087,65.3913,55.9139,44.0861,194.03,224,0.875,bilinear
resnest50d_4s2x40d,34.3611,65.6389,54.7112,45.2888,30.42,224,0.875,bicubic
tf_efficientnet_b1_ns,34.1528,65.8472,55.4894,44.5106,7.79,240,0.882,bicubic
tf_efficientnet_b4,34.0624,65.9376,54.216,45.784,19.34,380,0.922,bicubic
ssl_resnext101_32x8d,34.0211,65.9789,55.5935,44.4065,88.79,224,0.875,bilinear
tf_efficientnet_b6,34.0054,65.9946,54.5403,45.4597,43.04,528,0.942,bicubic
efficientnet_b3_pruned,33.9956,66.0044,54.1099,45.8901,9.86,300,0.904,bicubic
tresnet_xl,33.2587,66.7413,52.2962,47.7038,78.44,224,0.875,bilinear
resnest50d_1s4x24d,33.1388,66.8612,52.8307,47.1693,25.68,224,0.875,bicubic
resnest50d,32.9678,67.0322,52.701,47.299,27.48,224,0.875,bilinear
tf_efficientnet_b3,32.8637,67.1363,52.9623,47.0377,12.23,300,0.904,bicubic
inception_resnet_v2,32.736,67.264,50.6396,49.3604,55.84,299,0.8975,bicubic
gluon_resnet152_v1d,32.7301,67.2699,51.0837,48.9163,60.21,224,0.875,bicubic
@ -45,7 +51,7 @@ ens_adv_inception_resnet_v2,32.3705,67.6295,50.4274,49.5726,55.84,299,0.8975,bic
gluon_resnet152_v1s,32.3312,67.6688,50.5394,49.4606,60.32,224,0.875,bicubic
gluon_seresnext101_64x4d,32.1936,67.8064,50.3272,49.6728,88.23,224,0.875,bicubic
gluon_seresnext101_32x4d,32.115,67.885,51.2409,48.7591,48.96,224,0.875,bicubic
efficientnet_b3a,31.7279,68.2721,51.3215,48.6785,12.23,320,1,bicubic
efficientnet_b3a,31.7279,68.2721,51.3215,48.6785,12.23,320,1.0,bicubic
efficientnet_b3,31.5648,68.4352,51.2724,48.7276,12.23,300,0.904,bicubic
resnet50,31.5451,68.4549,50.1719,49.8281,25.56,224,0.875,bicubic
ssl_resnext101_32x4d,31.4331,68.5669,52.1154,47.8846,44.18,224,0.875,bilinear
@ -62,11 +68,13 @@ ecaresnet101d_pruned,30.8947,69.1053,50.001,49.999,24.88,224,0.875,bicubic
gluon_resnext101_32x4d,30.8809,69.1191,48.537,51.463,44.18,224,0.875,bicubic
tf_efficientnet_lite4,30.8397,69.1603,50.3979,49.6021,13.01,380,0.92,bilinear
dpn107,30.6805,69.3195,48.8062,51.1938,86.92,224,0.875,bicubic
ese_vovnet39b,30.6766,69.3234,49.8929,50.1071,24.57,224,0.875,bicubic
tresnet_xl_448,30.6196,69.3804,49.0715,50.9285,78.44,448,0.875,bilinear
gluon_resnet152_v1b,30.6176,69.3824,48.5311,51.4689,60.19,224,0.875,bicubic
gluon_resnet152_v1b,30.6176,69.3824,48.5292,51.4708,60.19,224,0.875,bicubic
ssl_resnext50_32x4d,30.594,69.406,50.6534,49.3466,25.03,224,0.875,bilinear
gluon_resnet101_v1d,30.5095,69.4905,47.975,52.025,44.57,224,0.875,bicubic
efficientnet_b2a,30.4231,69.5769,49.6748,50.3252,9.11,288,1,bicubic
resnest26d,30.4997,69.5003,50.677,49.323,17.07,224,0.875,bilinear
efficientnet_b2a,30.4231,69.5769,49.6748,50.3252,9.11,288,1.0,bicubic
tf_efficientnet_b1_ap,30.4191,69.5809,49.5529,50.4471,7.79,240,0.882,bicubic
dpn98,30.0576,69.9424,48.2403,51.7597,61.57,224,0.875,bicubic
tf_efficientnet_b2,30.0202,69.9798,49.5903,50.4097,9.11,260,0.89,bicubic
@ -75,14 +83,14 @@ senet154,30.0006,69.9994,48.032,51.968,115.09,224,0.875,bilinear
dpn92,29.9691,70.0309,49.1599,50.8401,37.67,224,0.875,bicubic
gluon_senet154,29.8866,70.1134,47.8728,52.1272,115.09,224,0.875,bicubic
xception,29.8493,70.1507,48.6903,51.3097,22.86,299,0.8975,bicubic
adv_inception_v3,29.8237,70.1763,47.8689,52.1311,23.83,299,0.875,bicubic
adv_inception_v3,29.8237,70.1763,47.8669,52.1331,23.83,299,0.875,bicubic
resnetblur50,29.6233,70.3767,48.2501,51.7499,25.56,224,0.875,bicubic
efficientnet_b2,29.6174,70.3826,48.7728,51.2272,9.11,260,0.875,bicubic
gluon_xception65,29.5545,70.4455,47.523,52.477,39.92,299,0.875,bicubic
resnext101_32x8d,29.4347,70.5653,48.482,51.518,88.79,224,0.875,bilinear
ssl_resnet50,29.4229,70.5771,49.773,50.227,25.56,224,0.875,bilinear
resnext50_32x4d,29.3285,70.6715,47.3953,52.6047,25.03,224,0.875,bicubic
ecaresnet50d_pruned,29.2165,70.7835,48.4604,51.5396,19.94,224,0.875,bicubic
ecaresnet50d_pruned,29.2165,70.7835,48.4584,51.5416,19.94,224,0.875,bicubic
tresnet_l_448,29.1674,70.8326,47.2342,52.7658,55.99,448,0.875,bilinear
gluon_inception_v3,29.1143,70.8857,46.9433,53.0567,23.83,299,0.875,bicubic
hrnet_w64,28.9866,71.0134,47.1399,52.8601,128.06,224,0.875,bilinear
@ -105,34 +113,49 @@ tf_efficientnet_cc_b0_4e,28.3106,71.6894,47.3639,52.6361,13.31,224,0.875,bicubic
mixnet_xl,28.293,71.707,46.7174,53.2826,11.9,224,0.875,bicubic
gluon_resnet50_v1d,28.236,71.764,45.8763,54.1237,25.58,224,0.875,bicubic
wide_resnet101_2,28.1063,71.8937,46.4246,53.5754,126.89,224,0.875,bilinear
gluon_resnet101_v1c,28.1023,71.8977,45.953,54.047,44.57,224,0.875,bicubic
densenet161,28.1004,71.8996,46.6506,53.3494,28.68,224,0.875,bicubic
gluon_resnet101_v1c,28.1004,71.8996,45.953,54.047,44.57,224,0.875,bicubic
regnetx_320,28.0788,71.9212,45.1198,54.8802,107.81,224,0.875,bicubic
regnety_320,28.0709,71.9291,45.4597,54.5403,145.05,224,0.875,bicubic
dpn68b,27.8842,72.1158,47.4602,52.5398,12.61,224,0.875,bicubic
tf_inception_v3,27.784,72.216,45.7132,54.2868,23.83,299,0.875,bicubic
regnetx_160,27.8253,72.1747,45.6307,54.3693,54.28,224,0.875,bicubic
tf_inception_v3,27.786,72.214,45.7113,54.2887,23.83,299,0.875,bicubic
res2net101_26w_4s,27.7742,72.2258,45.1709,54.8291,45.21,224,0.875,bilinear
regnety_160,27.6386,72.3614,45.5344,54.4656,83.59,224,0.875,bicubic
hrnet_w44,27.6248,72.3752,45.8311,54.1689,67.06,224,0.875,bilinear
inception_v3,27.5698,72.4302,45.2632,54.7368,27.16,299,0.875,bicubic
inception_v3,27.5698,72.4302,45.2613,54.7387,23.83,299,0.875,bicubic
regnetx_080,27.4106,72.5894,45.0215,54.9785,39.57,224,0.875,bicubic
hrnet_w30,27.3851,72.6149,46.5425,53.4575,37.71,224,0.875,bilinear
hrnet_w32,27.3772,72.6228,45.9903,54.0097,41.23,224,0.875,bilinear
gluon_resnet50_v1s,27.3281,72.6719,45.2141,54.7859,25.68,224,0.875,bicubic
gluon_resnet50_v1s,27.3261,72.6739,45.2141,54.7859,25.68,224,0.875,bicubic
densenet201,27.2613,72.7387,46.2241,53.7759,20.01,224,0.875,bicubic
regnety_064,27.2279,72.7721,44.8506,55.1494,30.58,224,0.875,bicubic
densenetblur121d,27.224,72.776,46.3067,53.6933,8.0,224,0.875,bicubic
efficientnet_b1_pruned,27.1945,72.8055,45.8724,54.1276,6.33,240,0.882,bicubic
res2net50_26w_8s,27.0726,72.9274,44.432,55.568,48.4,224,0.875,bilinear
dla102x,27.0235,72.9765,45.4951,54.5049,26.77,224,0.875,bilinear
resnet101,26.9685,73.0315,45.2357,54.7643,44.55,224,0.875,bilinear
resnext50d_32x4d,26.8742,73.1258,44.43,55.57,25.05,224,0.875,bicubic
regnetx_120,26.8644,73.1356,44.6816,55.3184,46.11,224,0.875,bicubic
seresnext101_32x4d,26.8192,73.1808,43.5084,56.4916,48.96,224,0.875,bilinear
densenet169,26.8113,73.1887,45.3752,54.6248,14.15,224,0.875,bicubic
regnetx_064,26.8015,73.1985,44.9036,55.0964,26.21,224,0.875,bicubic
regnety_120,26.7818,73.2182,44.4399,55.5601,51.82,224,0.875,bicubic
regnetx_032,26.7071,73.2929,45.2259,54.7741,15.3,224,0.875,bicubic
densenet121,26.6757,73.3243,45.8999,54.1001,7.98,224,0.875,bicubic
seresnet152,26.6718,73.3282,43.9447,56.0553,66.82,224,0.875,bilinear
tf_efficientnet_el,26.6226,73.3774,44.6364,55.3636,10.59,300,0.904,bicubic
efficientnet_es,26.6168,73.3832,45.106,54.894,5.44,224,0.875,bicubic
res2net50_26w_6s,26.5873,73.4127,43.9781,56.0219,37.05,224,0.875,bilinear
dla60x,26.5637,73.4363,45.0392,54.9608,17.65,224,0.875,bilinear
regnety_080,26.5146,73.4854,44.3554,55.6446,39.18,224,0.875,bicubic
tf_efficientnet_b0,26.491,73.509,45.6562,54.3438,5.29,224,0.875,bicubic
res2net50_14w_8s,26.4713,73.5287,44.3691,55.6309,25.06,224,0.875,bilinear
gluon_resnet50_v1b,26.432,73.568,44.0331,55.9669,25.56,224,0.875,bicubic
regnetx_040,26.2395,73.7605,44.4241,55.5759,22.12,224,0.875,bicubic
dpn68,26.1216,73.8784,44.2335,55.7665,12.61,224,0.875,bicubic
hrnet_w18,25.9761,74.0239,44.8093,55.1907,21.3,224,0.875,bilinear
regnety_040,25.9133,74.0867,43.8543,56.1457,20.65,224,0.875,bicubic
resnet34,25.8838,74.1162,43.9899,56.0101,21.8,224,0.875,bilinear
res2net50_26w_4s,25.87,74.13,43.1606,56.8394,25.7,224,0.875,bilinear
tresnet_m_448,25.8504,74.1496,42.8678,57.1322,31.39,448,0.875,bilinear
@ -148,21 +171,25 @@ tf_mixnet_l,25.42,74.58,42.5436,57.4564,7.33,224,0.875,bicubic
res2next50,25.3945,74.6055,42.4925,57.5075,24.67,224,0.875,bilinear
selecsls60b,25.3277,74.6723,43.5536,56.4464,32.77,224,0.875,bicubic
seresnet101,25.3277,74.6723,42.8285,57.1715,49.33,224,0.875,bilinear
regnety_032,25.3237,74.6763,42.9071,57.0929,19.44,224,0.875,bicubic
dla102,25.3139,74.6861,43.8366,56.1634,33.73,224,0.875,bilinear
wide_resnet50_2,25.31,74.69,42.1781,57.8219,68.88,224,0.875,bilinear
resnest14d,25.2825,74.7175,44.1215,55.8785,10.61,224,0.875,bilinear
seresnext50_32x4d,25.2176,74.7824,41.9383,58.0617,27.56,224,0.875,bilinear
res2net50_48w_2s,25.0231,74.9769,42.2017,57.7983,25.29,224,0.875,bilinear
efficientnet_b0,25.0152,74.9848,42.7853,57.2147,5.29,224,0.875,bicubic
gluon_resnet34_v1b,24.9484,75.0516,42.237,57.763,21.8,224,0.875,bicubic
mobilenetv2_120d,24.9327,75.0673,43.0643,56.9357,5.83,224,0.875,bicubic
dla60,24.9268,75.0732,43.3021,56.6979,22.33,224,0.875,bilinear
regnety_016,24.8187,75.1813,42.6261,57.3739,11.2,224,0.875,bicubic
tf_efficientnet_em,24.5338,75.4662,42.41,57.59,6.9,240,0.882,bicubic
tf_efficientnet_lite2,24.5299,75.4701,42.292,57.708,6.09,260,0.89,bicubic
skresnet18,24.4945,75.5055,42.5377,57.4623,11.96,224,0.875,bicubic
regnetx_016,24.4768,75.5232,42.5023,57.4977,9.19,224,0.875,bicubic
tf_efficientnet_lite0,24.3707,75.6293,42.5102,57.4898,4.65,224,0.875,bicubic
tv_resnet50,24.0917,75.9083,41.3095,58.6905,25.56,224,0.875,bilinear
seresnet34,24.0366,75.9634,41.8951,58.1049,21.96,224,0.875,bilinear
densenet121,23.846,76.154,41.9207,58.0793,7.98,224,0.875,bicubic
tv_densenet121,23.846,76.154,41.9207,58.0793,7.98,224,0.875,bicubic
tf_efficientnet_es,23.8244,76.1756,41.3193,58.6807,5.44,224,0.875,bicubic
mobilenetv2_140,23.7104,76.2896,41.4687,58.5313,6.11,224,0.875,bicubic
mixnet_m,23.7085,76.2915,41.1386,58.8614,5.01,224,0.875,bicubic
@ -176,26 +203,34 @@ mobilenetv3_large_100,22.665,77.335,40.7848,59.2152,5.48,224,0.875,bicubic
mobilenetv3_rw,22.6257,77.3743,40.3702,59.6298,5.48,224,0.875,bicubic
tf_mobilenetv3_large_100,22.5707,77.4293,39.7591,60.2409,5.48,224,0.875,bilinear
hrnet_w18_small_v2,22.3408,77.6592,39.8475,60.1525,15.6,224,0.875,bilinear
regnety_008,22.1128,77.8872,38.8964,61.1036,6.26,224,0.875,bicubic
seresnext26tn_32x4d,22.0028,77.9972,38.4916,61.5084,16.81,224,0.875,bicubic
seresnext26t_32x4d,21.9871,78.0129,38.5663,61.4337,16.82,224,0.875,bicubic
regnety_006,21.9733,78.0267,38.9534,61.0466,6.06,224,0.875,bicubic
regnetx_008,21.9517,78.0483,38.9298,61.0702,7.26,224,0.875,bicubic
resnet26d,21.9144,78.0856,38.6174,61.3826,16.01,224,0.875,bicubic
semnasnet_100,21.8967,78.1033,38.6036,61.3964,3.89,224,0.875,bicubic
regnetx_006,21.7434,78.2566,38.9043,61.0957,6.2,224,0.875,bicubic
gluon_resnet18_v1b,21.5449,78.4551,38.8728,61.1272,11.69,224,0.875,bicubic
fbnetc_100,21.4919,78.5081,38.1654,61.8346,5.57,224,0.875,bilinear
mnasnet_100,21.3504,78.6496,37.7154,62.2846,4.38,224,0.875,bicubic
resnet26,21.2954,78.7046,38.0161,61.9839,16,224,0.875,bicubic
resnet26,21.2954,78.7046,38.0161,61.9839,16.0,224,0.875,bicubic
ssl_resnet18,21.2777,78.7223,39.1145,60.8855,11.69,224,0.875,bilinear
mixnet_s,21.258,78.742,38.1929,61.8071,4.13,224,0.875,bicubic
seresnext26d_32x4d,21.2541,78.7459,37.2851,62.7149,16.81,224,0.875,bicubic
seresnext26_32x4d,21.093,78.907,37.6388,62.3612,16.79,224,0.875,bicubic
regnetx_004,20.8866,79.1134,37.5484,62.4516,5.16,224,0.875,bicubic
spnasnet_100,20.867,79.133,37.8923,62.1077,4.42,224,0.875,bilinear
seresnet18,20.8395,79.1605,37.6447,62.3553,11.78,224,0.875,bicubic
mobilenetv2_100,20.7609,79.2391,37.7508,62.2492,3.5,224,0.875,bicubic
tf_mixnet_s,20.4779,79.5221,36.6268,63.3732,4.13,224,0.875,bicubic
regnety_004,20.417,79.583,37.0296,62.9704,4.34,224,0.875,bicubic
tf_mobilenetv3_large_075,20.3718,79.6282,36.7702,63.2298,3.99,224,0.875,bilinear
hrnet_w18_small,20.3659,79.6341,37.0945,62.9055,13.19,224,0.875,bilinear
resnet18,20.2283,79.7717,37.2595,62.7405,11.69,224,0.875,bilinear
tf_mobilenetv3_large_minimal_100,20.1163,79.8837,36.9038,63.0962,3.92,224,0.875,bilinear
regnety_002,17.4596,82.5404,32.4432,67.5568,3.16,224,0.875,bicubic
regnetx_002,16.9506,83.0494,32.2349,67.7651,2.68,224,0.875,bicubic
dla60x_c,16.3257,83.6743,31.775,68.225,1.34,224,0.875,bilinear
tf_mobilenetv3_small_100,16.2334,83.7666,31.2229,68.7771,2.54,224,0.875,bilinear
tf_mobilenetv3_small_075,14.9404,85.0596,29.5722,70.4278,2.04,224,0.875,bilinear

@ -135,6 +135,12 @@ model_list = [
_entry('resnetblur50', 'ResNet-Blur-50', '1904.11486'),
_entry('densenet121', 'DenseNet-121', '1608.06993'),
_entry('densenetblur121d', 'DenseNet-Blur-121D', '1904.11486',
model_desc='DenseNet with blur pooling and deep stem'),
_entry('ese_vovnet39b', 'VoVNet-39-V2', '1911.06667'),
_entry('tf_efficientnet_b0', 'EfficientNet-B0 (AutoAugment)', '1905.11946',
model_desc='Ported from official Google AI Tensorflow weights'),
_entry('tf_efficientnet_b1', 'EfficientNet-B1 (AutoAugment)', '1905.11946',
@ -389,6 +395,34 @@ model_list = [
model_desc='Originally from https://github.com/mehtadushy/SelecSLS-Pytorch'),
_entry('selecsls60b', 'SelecSLS-60_B', '1907.00837',
model_desc='Originally from https://github.com/mehtadushy/SelecSLS-Pytorch'),
## RegNet official impl weights
_entry('regnetx_002', 'RegNetX-200MF', '2003.13678'),
_entry('regnetx_004', 'RegNetX-400MF', '2003.13678'),
_entry('regnetx_006', 'RegNetX-600MF', '2003.13678'),
_entry('regnetx_008', 'RegNetX-800MF', '2003.13678'),
_entry('regnetx_016', 'RegNetX-1.6GF', '2003.13678'),
_entry('regnetx_032', 'RegNetX-3.2GF', '2003.13678'),
_entry('regnetx_040', 'RegNetX-4.0GF', '2003.13678'),
_entry('regnetx_064', 'RegNetX-6.4GF', '2003.13678'),
_entry('regnetx_080', 'RegNetX-8.0GF', '2003.13678'),
_entry('regnetx_120', 'RegNetX-12GF', '2003.13678'),
_entry('regnetx_160', 'RegNetX-16GF', '2003.13678'),
_entry('regnetx_320', 'RegNetX-32GF', '2003.13678', batch_size=BATCH_SIZE // 2),
_entry('regnety_002', 'RegNetY-200MF', '2003.13678'),
_entry('regnety_004', 'RegNetY-400MF', '2003.13678'),
_entry('regnety_006', 'RegNetY-600MF', '2003.13678'),
_entry('regnety_008', 'RegNetY-800MF', '2003.13678'),
_entry('regnety_016', 'RegNetY-1.6GF', '2003.13678'),
_entry('regnety_032', 'RegNetY-3.2GF', '2003.13678'),
_entry('regnety_040', 'RegNetY-4.0GF', '2003.13678'),
_entry('regnety_064', 'RegNetY-6.4GF', '2003.13678'),
_entry('regnety_080', 'RegNetY-8.0GF', '2003.13678'),
_entry('regnety_120', 'RegNetY-12GF', '2003.13678'),
_entry('regnety_160', 'RegNetY-16GF', '2003.13678'),
_entry('regnety_320', 'RegNetY-32GF', '2003.13678', batch_size=BATCH_SIZE // 2),
]
for m in model_list:

@ -0,0 +1,71 @@
import pytest
import torch
import torch.nn as nn
import platform
import os
from timm.models.layers import create_act_layer, get_act_layer, set_layer_config
class MLP(nn.Module):
def __init__(self, act_layer="relu"):
super(MLP, self).__init__()
self.fc1 = nn.Linear(1000, 100)
self.act = create_act_layer(act_layer, inplace=True)
self.fc2 = nn.Linear(100, 10)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
return x
def _run_act_layer_grad(act_type):
x = torch.rand(10, 1000) * 10
m = MLP(act_layer=act_type)
def _run(x, act_layer=''):
if act_layer:
# replace act layer if set
m.act = create_act_layer(act_layer, inplace=True)
out = m(x)
l = (out - 0).pow(2).sum()
return l
out_me = _run(x)
with set_layer_config(scriptable=True):
out_jit = _run(x, act_type)
assert torch.isclose(out_jit, out_me)
with set_layer_config(no_jit=True):
out_basic = _run(x, act_type)
assert torch.isclose(out_basic, out_jit)
def test_swish_grad():
for _ in range(100):
_run_act_layer_grad('swish')
def test_mish_grad():
for _ in range(100):
_run_act_layer_grad('mish')
def test_hard_sigmoid_grad():
for _ in range(100):
_run_act_layer_grad('hard_sigmoid')
def test_hard_swish_grad():
for _ in range(100):
_run_act_layer_grad('hard_swish')
def test_hard_mish_grad():
for _ in range(100):
_run_act_layer_grad('hard_mish')

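A minimal usage sketch of the activation factory these tests exercise, using only the names imported above (create_act_layer, set_layer_config); the three variants are expected to agree numerically, which is exactly what _run_act_layer_grad asserts:

import torch
from timm.models.layers import create_act_layer, set_layer_config

# Illustrative sketch: the factory resolves an activation by name and picks an
# implementation (memory-efficient autograd, jit, or plain) per the layer config.
x = torch.randn(2, 8)
y_me = create_act_layer('hard_swish')(x)          # default: memory-efficient grad impl
with set_layer_config(scriptable=True):
    y_jit = create_act_layer('hard_swish')(x)     # scriptable / export-safe variant
with set_layer_config(no_jit=True):
    y_plain = create_act_layer('hard_swish')(x)   # plain PyTorch ops
assert torch.allclose(y_me, y_jit) and torch.allclose(y_me, y_plain)
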
@ -4,7 +4,7 @@ import platform
import os
import fnmatch
from timm import list_models, create_model
from timm import list_models, create_model, set_scriptable
if 'GITHUB_ACTIONS' in os.environ and 'Linux' in platform.system():
@ -53,6 +53,8 @@ def test_model_backward(model_name, batch_size):
inputs = torch.randn((batch_size, *input_size))
outputs = model(inputs)
outputs.mean().backward()
for n, x in model.named_parameters():
assert x.grad is not None, f'No gradient for {n}'
num_grad = sum([x.grad.numel() for x in model.parameters() if x.grad is not None])
assert outputs.shape[-1] == 42
@ -83,3 +85,25 @@ def test_model_default_cfgs(model_name, batch_size):
assert outputs.shape[-1] == pool_size[-1] and outputs.shape[-2] == pool_size[-2]
assert any([k.startswith(classifier) for k in state_dict.keys()]), f'{classifier} not in model params'
assert any([k.startswith(first_conv) for k in state_dict.keys()]), f'{first_conv} not in model params'
EXCLUDE_JIT_FILTERS = [
'*iabn*', 'tresnet*', # models using inplace abn unlikely to ever be scriptable
'dla*', 'hrnet*', # hopefully fix at some point
]
@pytest.mark.timeout(120)
@pytest.mark.parametrize('model_name', list_models(exclude_filters=EXCLUDE_FILTERS + EXCLUDE_JIT_FILTERS))
@pytest.mark.parametrize('batch_size', [1])
def test_model_forward_torchscript(model_name, batch_size):
"""Run a single forward pass with each model"""
with set_scriptable(True):
model = create_model(model_name, pretrained=False)
model.eval()
input_size = (3, 128, 128) # jit compile is already a bit slow and we've tested normal res already...
model = torch.jit.script(model)
outputs = model(torch.randn((batch_size, *input_size)))
assert outputs.shape[0] == batch_size
assert not torch.isnan(outputs).any(), 'Output included NaNs'

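For reference, a sketch of the export flow this new torchscript test implies (model name illustrative; any model outside EXCLUDE_JIT_FILTERS should work on PyTorch 1.5+):

import torch
from timm import create_model, set_scriptable

with set_scriptable(True):                      # force scriptable layer variants
    model = create_model('ese_vovnet39b', pretrained=False)
model.eval()
scripted = torch.jit.script(model)              # compiles without the context active
out = scripted(torch.randn(1, 3, 224, 224))
assert out.shape == (1, 1000)
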
@ -1,2 +1,3 @@
from .version import __version__
from .models import create_model, list_models, is_model, list_modules, model_entrypoint
from .models import create_model, list_models, is_model, list_modules, model_entrypoint, \
is_scriptable, is_exportable, set_scriptable, set_exportable

@ -20,9 +20,11 @@ from .sknet import *
from .tresnet import *
from .resnest import *
from .regnet import *
from .vovnet import *
from .registry import *
from .factory import create_model
from .helpers import load_checkpoint, resume_checkpoint
from .layers import TestTimePoolHead, apply_test_time_pool
from .layers import convert_splitbn_model
from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit

@ -2,17 +2,20 @@
This file is a copy of https://github.com/pytorch/vision 'densenet.py' (BSD-3-Clause) with
fixed kwargs passthrough and addition of dynamic global avg/max pool.
"""
import re
from collections import OrderedDict
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from torch.jit.annotations import List
from .registry import register_model
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import re
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d, BatchNormAct2d, create_norm_act, BlurPool2d
from .registry import register_model
__all__ = ['DenseNet']
@ -27,124 +30,242 @@ def _cfg(url=''):
default_cfgs = {
'densenet121': _cfg(url='https://download.pytorch.org/models/densenet121-a639ec97.pth'),
'densenet121': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenet121_ra-50efcf5c.pth'),
'densenet121d': _cfg(url=''),
'densenetblur121d': _cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenetblur121d_ra-100dcfbc.pth'),
'densenet169': _cfg(url='https://download.pytorch.org/models/densenet169-b2777c0a.pth'),
'densenet201': _cfg(url='https://download.pytorch.org/models/densenet201-c1103571.pth'),
'densenet161': _cfg(url='https://download.pytorch.org/models/densenet161-8d451a50.pth'),
'densenet264': _cfg(url=''),
'densenet264d_iabn': _cfg(url=''),
'tv_densenet121': _cfg(url='https://download.pytorch.org/models/densenet121-a639ec97.pth'),
}
class _DenseLayer(nn.Sequential):
def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
super(_DenseLayer, self).__init__()
self.add_module('norm1', nn.BatchNorm2d(num_input_features)),
self.add_module('relu1', nn.ReLU(inplace=True)),
self.add_module('conv1', nn.Conv2d(num_input_features, bn_size *
growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm2', nn.BatchNorm2d(bn_size * growth_rate)),
self.add_module('relu2', nn.ReLU(inplace=True)),
self.add_module('conv2', nn.Conv2d(bn_size * growth_rate, growth_rate,
kernel_size=3, stride=1, padding=1, bias=False)),
self.drop_rate = drop_rate
class DenseLayer(nn.Module):
def __init__(self, num_input_features, growth_rate, bn_size, norm_layer=BatchNormAct2d,
drop_rate=0., memory_efficient=False):
super(DenseLayer, self).__init__()
self.add_module('norm1', norm_layer(num_input_features)),
self.add_module('conv1', nn.Conv2d(
num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)),
self.add_module('norm2', norm_layer(bn_size * growth_rate)),
self.add_module('conv2', nn.Conv2d(
bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)),
self.drop_rate = float(drop_rate)
self.memory_efficient = memory_efficient
def bottleneck_fn(self, xs):
# type: (List[torch.Tensor]) -> torch.Tensor
concated_features = torch.cat(xs, 1)
bottleneck_output = self.conv1(self.norm1(concated_features)) # noqa: T484
return bottleneck_output
# todo: rewrite when torchscript supports any
def any_requires_grad(self, x):
# type: (List[torch.Tensor]) -> bool
for tensor in x:
if tensor.requires_grad:
return True
return False
@torch.jit.unused # noqa: T484
def call_checkpoint_bottleneck(self, x):
# type: (List[torch.Tensor]) -> torch.Tensor
def closure(*xs):
return self.bottleneck_fn(xs)
return cp.checkpoint(closure, *x)
@torch.jit._overload_method # noqa: F811
def forward(self, x):
# type: (List[torch.Tensor]) -> (torch.Tensor)
pass
@torch.jit._overload_method # noqa: F811
def forward(self, x):
new_features = super(_DenseLayer, self).forward(x)
# type: (torch.Tensor) -> (torch.Tensor)
pass
# torchscript does not yet support *args, so we overload method
# allowing it to take either a List[Tensor] or single Tensor
def forward(self, x): # noqa: F811
if isinstance(x, torch.Tensor):
prev_features = [x]
else:
prev_features = x
if self.memory_efficient and self.any_requires_grad(prev_features):
if torch.jit.is_scripting():
raise Exception("Memory Efficient not supported in JIT")
bottleneck_output = self.call_checkpoint_bottleneck(prev_features)
else:
bottleneck_output = self.bottleneck_fn(prev_features)
new_features = self.conv2(self.norm2(bottleneck_output))
if self.drop_rate > 0:
new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
return torch.cat([x, new_features], 1)
return new_features
class _DenseBlock(nn.Sequential):
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
super(_DenseBlock, self).__init__()
class DenseBlock(nn.ModuleDict):
_version = 2
def __init__(self, num_layers, num_input_features, bn_size, growth_rate, norm_layer=BatchNormAct2d,
drop_rate=0., memory_efficient=False):
super(DenseBlock, self).__init__()
for i in range(num_layers):
layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
layer = DenseLayer(
num_input_features + i * growth_rate,
growth_rate=growth_rate,
bn_size=bn_size,
norm_layer=norm_layer,
drop_rate=drop_rate,
memory_efficient=memory_efficient,
)
self.add_module('denselayer%d' % (i + 1), layer)
def forward(self, init_features):
features = [init_features]
for name, layer in self.items():
new_features = layer(features)
features.append(new_features)
return torch.cat(features, 1)
class _Transition(nn.Sequential):
def __init__(self, num_input_features, num_output_features):
super(_Transition, self).__init__()
self.add_module('norm', nn.BatchNorm2d(num_input_features))
self.add_module('relu', nn.ReLU(inplace=True))
self.add_module('conv', nn.Conv2d(num_input_features, num_output_features,
kernel_size=1, stride=1, bias=False))
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
class DenseTransition(nn.Sequential):
def __init__(self, num_input_features, num_output_features, norm_layer=nn.BatchNorm2d, aa_layer=None):
super(DenseTransition, self).__init__()
self.add_module('norm', norm_layer(num_input_features))
self.add_module('conv', nn.Conv2d(
num_input_features, num_output_features, kernel_size=1, stride=1, bias=False))
if aa_layer is not None:
self.add_module('pool', aa_layer(num_output_features, stride=2))
else:
self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2))
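A hedged sketch of exercising the new aa_layer hook directly, with BlurPool2d from the layers import above (shapes assume the defaults in this diff):

import torch
from timm.models.layers import BlurPool2d

# norm -> 1x1 conv (halve channels) -> blur pool (stride-2 anti-aliased downsample)
t = DenseTransition(256, 128, aa_layer=BlurPool2d)
y = t(torch.randn(1, 256, 56, 56))  # -> torch.Size([1, 128, 28, 28])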
class DenseNet(nn.Module):
r"""Densenet-BC model class, based on
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`_
Args:
growth_rate (int) - how many filters to add each layer (`k` in paper)
block_config (list of 4 ints) - how many layers in each pooling block
num_init_features (int) - the number of filters to learn in the first convolution layer
bn_size (int) - multiplicative factor for number of bottleneck layers
(i.e. bn_size * k features in the bottleneck layer)
drop_rate (float) - dropout rate after each dense layer
num_classes (int) - number of classification classes
memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient,
but slower. Default: *False*. See `"paper" <https://arxiv.org/pdf/1707.06990.pdf>`_
"""
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16),
num_init_features=64, bn_size=4, drop_rate=0,
num_classes=1000, in_chans=3, global_pool='avg'):
def __init__(self, growth_rate=32, block_config=(6, 12, 24, 16), bn_size=4, stem_type='',
num_classes=1000, in_chans=3, global_pool='avg',
norm_layer=BatchNormAct2d, aa_layer=None, drop_rate=0, memory_efficient=False,
aa_stem_only=True):
self.num_classes = num_classes
self.drop_rate = drop_rate
super(DenseNet, self).__init__()
# First convolution
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', nn.BatchNorm2d(num_init_features)),
('relu0', nn.ReLU(inplace=True)),
('pool0', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)),
]))
# Each denseblock
# Stem
deep_stem = 'deep' in stem_type # 3x3 deep stem
num_init_features = growth_rate * 2
if aa_layer is None:
stem_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
else:
stem_pool = nn.Sequential(*[
nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
aa_layer(channels=num_init_features, stride=2)])
if deep_stem:
stem_chs_1 = stem_chs_2 = growth_rate
if 'tiered' in stem_type:
stem_chs_1 = 3 * (growth_rate // 4)
stem_chs_2 = num_init_features if 'narrow' in stem_type else 6 * (growth_rate // 4)
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False)),
('norm0', norm_layer(stem_chs_1)),
('conv1', nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False)),
('norm1', norm_layer(stem_chs_2)),
('conv2', nn.Conv2d(stem_chs_2, num_init_features, 3, stride=1, padding=1, bias=False)),
('norm2', norm_layer(num_init_features)),
('pool0', stem_pool),
]))
else:
self.features = nn.Sequential(OrderedDict([
('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)),
('norm0', norm_layer(num_init_features)),
('pool0', stem_pool),
]))
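# NOTE: channel progression of the stem variants for growth_rate=32 (worked example):
#   classic 7x7 stem:      3 -> 64
#   'deep':                3 -> 32 -> 32 -> 64
#   'deep_tiered':         3 -> 24 -> 48 -> 64  (stem_chs_1 = 3 * (growth_rate // 4), stem_chs_2 = 6 * (growth_rate // 4))
#   'deep_tiered_narrow':  3 -> 24 -> 64 -> 64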
# DenseBlocks
num_features = num_init_features
for i, num_layers in enumerate(block_config):
block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
block = DenseBlock(
num_layers=num_layers,
num_input_features=num_features,
bn_size=bn_size,
growth_rate=growth_rate,
norm_layer=norm_layer,
drop_rate=drop_rate,
memory_efficient=memory_efficient
)
self.features.add_module('denseblock%d' % (i + 1), block)
num_features = num_features + num_layers * growth_rate
transition_aa_layer = None if aa_stem_only else aa_layer
if i != len(block_config) - 1:
trans = _Transition(
num_input_features=num_features, num_output_features=num_features // 2)
trans = DenseTransition(
num_input_features=num_features, num_output_features=num_features // 2,
norm_layer=norm_layer, aa_layer=transition_aa_layer)
self.features.add_module('transition%d' % (i + 1), trans)
num_features = num_features // 2
# Final batch norm
self.features.add_module('norm5', nn.BatchNorm2d(num_features))
self.features.add_module('norm5', norm_layer(num_features))
# Linear layer
self.num_features = num_features
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.classifier = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
# Official init from torch repo.
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.constant_(m.bias, 0)
def get_classifier(self):
return self.classifier
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.classifier = nn.Linear(num_features, num_classes)
else:
self.classifier = nn.Identity()
def forward_features(self, x):
x = self.features(x)
x = F.relu(x, inplace=True)
return x
return self.features(x)
def forward(self, x):
x = self.forward_features(x)
x = self.global_pool(x).flatten(1)
if self.drop_rate > 0.:
x = F.dropout(x, p=self.drop_rate, training=self.training)
# both classifier and block drop?
# if self.drop_rate > 0.:
# x = F.dropout(x, p=self.drop_rate, training=self.training)
x = self.classifier(x)
return x
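The `memory_efficient` flag trades compute for memory by recomputing the dense-layer bottleneck under checkpointing during backward rather than storing the concatenated features; a usage sketch:

import torch

model = DenseNet(growth_rate=32, block_config=(6, 12, 24, 16), memory_efficient=True)
loss = model(torch.randn(2, 3, 224, 224)).sum()
loss.backward()  # intermediate concat features are recomputed, not kept alive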
def _filter_pretrained(state_dict):
def _filter_torchvision_pretrained(state_dict):
pattern = re.compile(
r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$')
@ -157,58 +278,117 @@ def _filter_pretrained(state_dict):
return state_dict
def _densenet(variant, growth_rate, block_config, pretrained, **kwargs):
if kwargs.pop('features_only', False):
assert False, 'Not Implemented' # TODO
load_strict = False
kwargs.pop('num_classes', 0)
model_class = DenseNet
else:
load_strict = True
model_class = DenseNet
default_cfg = default_cfgs[variant]
model = model_class(growth_rate=growth_rate, block_config=block_config, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(
model, default_cfg,
num_classes=kwargs.get('num_classes', 0),
in_chans=kwargs.get('in_chans', 3),
filter_fn=_filter_torchvision_pretrained,
strict=load_strict)
return model
@register_model
def densenet121(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
def densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
default_cfg = default_cfgs['densenet121']
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
num_classes=num_classes, in_chans=in_chans, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans, filter_fn=_filter_pretrained)
model = _densenet(
'densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs)
return model
@register_model
def densenet169(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
def densenetblur121d(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
model = _densenet(
'densenetblur121d', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, stem_type='deep',
aa_layer=BlurPool2d, **kwargs)
return model
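A quick usage sketch via the factory (random init here; whether pretrained weights exist for this variant depends on the release):

import torch
import timm

model = timm.create_model('densenetblur121d', pretrained=False)
out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])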
@register_model
def densenet121d(pretrained=False, **kwargs):
r"""Densenet-121 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
model = _densenet(
'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep',
pretrained=pretrained, **kwargs)
return model
@register_model
def densenet169(pretrained=False, **kwargs):
r"""Densenet-169 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
default_cfg = default_cfgs['densenet169']
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
num_classes=num_classes, in_chans=in_chans, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans, filter_fn=_filter_pretrained)
model = _densenet(
'densenet169', growth_rate=32, block_config=(6, 12, 32, 32), pretrained=pretrained, **kwargs)
return model
@register_model
def densenet201(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
def densenet201(pretrained=False, **kwargs):
r"""Densenet-201 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
default_cfg = default_cfgs['densenet201']
model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
num_classes=num_classes, in_chans=in_chans, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans, filter_fn=_filter_pretrained)
model = _densenet(
'densenet201', growth_rate=32, block_config=(6, 12, 48, 32), pretrained=pretrained, **kwargs)
return model
@register_model
def densenet161(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
r"""Densenet-201 model from
def densenet161(pretrained=False, **kwargs):
r"""Densenet-161 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
default_cfg = default_cfgs['densenet161']
model = DenseNet(num_init_features=96, growth_rate=48, block_config=(6, 12, 36, 24),
num_classes=num_classes, in_chans=in_chans, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans, filter_fn=_filter_pretrained)
model = _densenet(
'densenet161', growth_rate=48, block_config=(6, 12, 36, 24), pretrained=pretrained, **kwargs)
return model
@register_model
def densenet264(pretrained=False, **kwargs):
r"""Densenet-264 model from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
model = _densenet(
'densenet264', growth_rate=48, block_config=(6, 12, 64, 48), pretrained=pretrained, **kwargs)
return model
@register_model
def densenet264d_iabn(pretrained=False, **kwargs):
r"""Densenet-264 model with deep stem and Inplace-ABN
"""
def norm_act_fn(num_features, **kwargs):
return create_norm_act('iabn', num_features, **kwargs)
model = _densenet(
'densenet264d_iabn', growth_rate=48, block_config=(6, 12, 64, 48), stem_type='deep',
norm_layer=norm_act_fn, pretrained=pretrained, **kwargs)
return model
@register_model
def tv_densenet121(pretrained=False, **kwargs):
r"""Densenet-121 model with original Torchvision weights, from
`"Densely Connected Convolutional Networks" <https://arxiv.org/pdf/1608.06993.pdf>`
"""
model = _densenet(
'tv_densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs)
return model

@ -11,11 +11,10 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
__all__ = ['DLA']
@ -51,6 +50,7 @@ default_cfgs = {
class DlaBasic(nn.Module):
"""DLA Basic"""
def __init__(self, inplanes, planes, stride=1, dilation=1, **_):
super(DlaBasic, self).__init__()
self.conv1 = nn.Conv2d(
@ -170,7 +170,7 @@ class DlaBottle2neck(nn.Module):
sp = bn(sp)
sp = self.relu(sp)
spo.append(sp)
if self.scale > 1 :
if self.scale > 1:
spo.append(self.pool(spx[-1]) if self.is_first else spx[-1])
out = torch.cat(spo, 1)
@ -307,9 +307,10 @@ class DLA(nn.Module):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
if num_classes:
self.fc = nn.Conv2d(self.num_features * self.global_pool.feat_mult(), num_classes, 1, bias=True)
num_features = self.num_features * self.global_pool.feat_mult()
self.fc = nn.Conv2d(num_features, num_classes, kernel_size=1, bias=True)
else:
self.fc = None
self.fc = nn.Identity()
def forward_features(self, x):
x = self.base_layer(x)

@ -9,16 +9,17 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
from typing import Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from .registry import register_model
from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD
from .registry import register_model
__all__ = ['DPN']
@ -54,8 +55,19 @@ class CatBnAct(nn.Module):
self.bn = nn.BatchNorm2d(in_chs, eps=0.001)
self.act = activation_fn
@torch.jit._overload_method # noqa: F811
def forward(self, x):
x = torch.cat(x, dim=1) if isinstance(x, tuple) else x
# type: (Tuple[torch.Tensor, torch.Tensor]) -> (torch.Tensor)
pass
@torch.jit._overload_method # noqa: F811
def forward(self, x):
# type: (torch.Tensor) -> (torch.Tensor)
pass
def forward(self, x):
if isinstance(x, tuple):
x = torch.cat(x, dim=1)
return self.act(self.bn(x))
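The pair of `@torch.jit._overload_method` stubs gives torchscript a typed signature for each accepted input, while eager mode only ever sees the final `forward`. A stripped-down sketch of the same idiom (note `_overload_method` is a private torchscript API, used here on PyTorch 1.5+):

import torch
import torch.nn as nn
from typing import Tuple


class CatOrPass(nn.Module):
    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
        # type: (Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor
        pass

    @torch.jit._overload_method  # noqa: F811
    def forward(self, x):
        # type: (torch.Tensor) -> torch.Tensor
        pass

    def forward(self, x):
        # concatenate tuple inputs, pass plain tensors through unchanged
        if isinstance(x, tuple):
            x = torch.cat(x, dim=1)
        return x


scripted = torch.jit.script(CatOrPass())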
@ -107,6 +119,8 @@ class DualPathBlock(nn.Module):
self.key_stride = 1
self.has_proj = False
self.c1x1_w_s1 = None
self.c1x1_w_s2 = None
if self.has_proj:
# Using different member names here to allow easier parameter key matching for conversion
if self.key_stride == 2:
@ -115,6 +129,7 @@ class DualPathBlock(nn.Module):
else:
self.c1x1_w_s1 = BnActConv2d(
in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=1)
self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1)
self.c3x3_b = BnActConv2d(
in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3,
@ -125,27 +140,46 @@ class DualPathBlock(nn.Module):
self.c1x1_c2 = nn.Conv2d(num_3x3_b, inc, kernel_size=1, bias=False)
else:
self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1)
self.c1x1_c1 = None
self.c1x1_c2 = None
@torch.jit._overload_method # noqa: F811
def forward(self, x):
x_in = torch.cat(x, dim=1) if isinstance(x, tuple) else x
if self.has_proj:
if self.key_stride == 2:
x_s = self.c1x1_w_s2(x_in)
else:
x_s = self.c1x1_w_s1(x_in)
x_s1 = x_s[:, :self.num_1x1_c, :, :]
x_s2 = x_s[:, self.num_1x1_c:, :, :]
# type: (Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]
pass
@torch.jit._overload_method # noqa: F811
def forward(self, x):
# type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
pass
def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor]:
if isinstance(x, tuple):
x_in = torch.cat(x, dim=1)
else:
x_in = x
if self.c1x1_w_s1 is None and self.c1x1_w_s2 is None:
# self.has_proj == False, torchscript requires condition on module == None
x_s1 = x[0]
x_s2 = x[1]
else:
# self.has_proj == True
if self.c1x1_w_s1 is not None:
# self.key_stride = 1
x_s = self.c1x1_w_s1(x_in)
else:
# self.key_stride = 2
x_s = self.c1x1_w_s2(x_in)
x_s1 = x_s[:, :self.num_1x1_c, :, :]
x_s2 = x_s[:, self.num_1x1_c:, :, :]
x_in = self.c1x1_a(x_in)
x_in = self.c3x3_b(x_in)
if self.b:
x_in = self.c1x1_c(x_in)
x_in = self.c1x1_c(x_in)
if self.c1x1_c1 is not None:
# self.b == True, using None check for torchscript compat
out1 = self.c1x1_c1(x_in)
out2 = self.c1x1_c2(x_in)
else:
x_in = self.c1x1_c(x_in)
out1 = x_in[:, :self.num_1x1_c, :, :]
out2 = x_in[:, self.num_1x1_c:, :, :]
resid = x_s1 + out1
@ -167,11 +201,9 @@ class DPN(nn.Module):
# conv1
if small:
blocks['conv1_1'] = InputBlock(
num_init_features, in_chans=in_chans, kernel_size=3, padding=1)
blocks['conv1_1'] = InputBlock(num_init_features, in_chans=in_chans, kernel_size=3, padding=1)
else:
blocks['conv1_1'] = InputBlock(
num_init_features, in_chans=in_chans, kernel_size=7, padding=3)
blocks['conv1_1'] = InputBlock(num_init_features, in_chans=in_chans, kernel_size=7, padding=3)
# conv2
bw = 64 * bw_factor
@ -218,8 +250,8 @@ class DPN(nn.Module):
# Using 1x1 conv for the FC layer to allow the extra pooling scheme
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.classifier = nn.Conv2d(
self.num_features * self.global_pool.feat_mult(), num_classes, kernel_size=1, bias=True)
num_features = self.num_features * self.global_pool.feat_mult()
self.classifier = nn.Conv2d(num_features, num_classes, kernel_size=1, bias=True)
def get_classifier(self):
return self.classifier
@ -228,10 +260,10 @@ class DPN(nn.Module):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
if num_classes:
self.classifier = nn.Conv2d(
self.num_features * self.global_pool.feat_mult(), num_classes, kernel_size=1, bias=True)
num_features = self.num_features * self.global_pool.feat_mult()
self.classifier = nn.Conv2d(num_features, num_classes, kernel_size=1, bias=True)
else:
self.classifier = None
self.classifier = nn.Identity()
def forward_features(self, x):
return self.features(x)

@ -24,14 +24,19 @@ An implementation of EfficienNet that covers variety of related models with effi
Hacked together by Ross Wightman
"""
from .efficientnet_builder import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights
from .feature_hooks import FeatureHooks
from .registry import register_model
from .helpers import load_pretrained, adapt_model_from_file
from .layers import SelectAdaptivePool2d
from timm.models.layers import create_conv2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .layers import SelectAdaptivePool2d, create_conv2d
from .registry import register_model
__all__ = ['EfficientNet']
@ -373,8 +378,11 @@ class EfficientNet(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.classifier = nn.Linear(num_features, num_classes)
else:
self.classifier = nn.Identity()
def forward_features(self, x):
x = self.conv_stem(x)
@ -466,7 +474,7 @@ class EfficientNetFeatures(nn.Module):
return self._feature_info[idx]
return [self._feature_info[i] for i in self.out_indices]
def forward(self, x):
def forward(self, x) -> List[torch.Tensor]:
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
@ -630,7 +638,7 @@ def _gen_mobilenet_v2(
fix_stem=fix_stem_head,
channel_multiplier=channel_multiplier,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=nn.ReLU6,
act_layer=resolve_act_layer(kwargs, 'relu6'),
**kwargs
)
model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -740,7 +748,7 @@ def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pre
num_features=round_channels(1280, channel_multiplier, 8, None),
stem_size=32,
channel_multiplier=channel_multiplier,
act_layer=Swish,
act_layer=resolve_act_layer(kwargs, 'swish'),
norm_kwargs=resolve_bn_args(kwargs),
variant=variant,
**kwargs,
@ -771,7 +779,7 @@ def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0
stem_size=32,
channel_multiplier=channel_multiplier,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=nn.ReLU,
act_layer=resolve_act_layer(kwargs, 'relu'),
**kwargs,
)
model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -785,13 +793,13 @@ def _gen_efficientnet_condconv(
Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv
"""
arch_def = [
['ds_r1_k3_s1_e1_c16_se0.25'],
['ir_r2_k3_s2_e6_c24_se0.25'],
['ir_r2_k5_s2_e6_c40_se0.25'],
['ir_r3_k3_s2_e6_c80_se0.25'],
['ir_r3_k5_s1_e6_c112_se0.25_cc4'],
['ir_r4_k5_s2_e6_c192_se0.25_cc4'],
['ir_r1_k3_s1_e6_c320_se0.25_cc4'],
]
# NOTE unlike official impl, this one uses `cc<x>` option where x is the base number of experts for each stage and
# the expert_multiplier increases that on a per-model basis as with depth/channel multipliers
@ -801,7 +809,7 @@ def _gen_efficientnet_condconv(
stem_size=32,
channel_multiplier=channel_multiplier,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=Swish,
act_layer=resolve_act_layer(kwargs, 'swish'),
**kwargs,
)
model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -841,7 +849,7 @@ def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0
stem_size=32,
fix_stem=True,
channel_multiplier=channel_multiplier,
act_layer=nn.ReLU6,
act_layer=resolve_act_layer(kwargs, 'relu6'),
norm_kwargs=resolve_bn_args(kwargs),
**kwargs,
)
@ -1187,6 +1195,7 @@ def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
pretrained=pretrained, **kwargs)
return model
@register_model
def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
""" EfficientNet-CondConv-B1 w/ 8 Experts """
@ -1242,8 +1251,6 @@ def efficientnet_lite4(pretrained=False, **kwargs):
return model
@register_model
def efficientnet_b1_pruned(pretrained=False, **kwargs):
""" EfficientNet-B1 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """
@ -1275,8 +1282,6 @@ def efficientnet_b3_pruned(pretrained=False, **kwargs):
return model
@register_model
def tf_efficientnet_b0(pretrained=False, **kwargs):
""" EfficientNet-B0. Tensorflow compatible variant """
@ -1619,6 +1624,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
pretrained=pretrained, **kwargs)
return model
@register_model
def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
""" EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
@ -1764,4 +1770,3 @@ def tf_mixnet_l(pretrained=False, **kwargs):
model = _gen_mixnet_m(
'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs)
return model

@ -1,9 +1,9 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
from .layers.activations import sigmoid
from .layers import create_conv2d, drop_path
from .layers import create_conv2d, drop_path, get_act_layer
from .layers.activations import sigmoid
# Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
# papers and TF reference implementations. PT momentum equiv for TF decay is (1 - TF decay)
@ -52,6 +52,13 @@ def resolve_se_args(kwargs, in_chs, act_layer=None):
return se_kwargs
def resolve_act_layer(kwargs, default='relu'):
act_layer = kwargs.pop('act_layer', default)
if isinstance(act_layer, str):
act_layer = get_act_layer(act_layer)
return act_layer
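So model entrypoints can accept `act_layer` either as a string or as a layer class; a small sketch of the resolution:

kwargs = dict(act_layer='relu6')
act_layer = resolve_act_layer(kwargs, default='relu')  # -> nn.ReLU6 via the activation factory
assert 'act_layer' not in kwargs  # consumed from kwargs

act_layer = resolve_act_layer({}, default='relu')      # falls back to the default name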
def make_divisible(v, divisor=8, min_value=None):
min_value = min_value or divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)

@ -1,13 +1,15 @@
import logging
import math
import re
from collections.__init__ import OrderedDict
from collections import OrderedDict
from copy import deepcopy
import torch.nn as nn
from .layers import CondConv2d, get_condconv_initializer
from .layers.activations import HardSwish, Swish
from .efficientnet_blocks import *
from .layers import CondConv2d, get_condconv_initializer
__all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights"]
def _parse_ksize(ss):
@ -57,13 +59,13 @@ def _decode_block_str(block_str):
key = op[0]
v = op[1:]
if v == 're':
value = nn.ReLU
value = get_act_layer('relu')
elif v == 'r6':
value = nn.ReLU6
value = get_act_layer('relu6')
elif v == 'hs':
value = HardSwish
value = get_act_layer('hard_swish')
elif v == 'sw':
value = Swish
value = get_act_layer('swish')
else:
continue
options[key] = value

@ -1,5 +1,6 @@
from .registry import is_model, is_model_in_modules, model_entrypoint
from .helpers import load_checkpoint
from .layers import set_layer_config
def create_model(
@ -8,6 +9,9 @@ def create_model(
num_classes=1000,
in_chans=3,
checkpoint_path='',
scriptable=None,
exportable=None,
no_jit=None,
**kwargs):
"""Create a model
@ -17,13 +21,16 @@ def create_model(
num_classes (int): number of classes for final fully connected layer (default: 1000)
in_chans (int): number of input channels / colors (default: 3)
checkpoint_path (str): path of checkpoint to load after model is initialized
scriptable (bool): set layer config so that model is jit scriptable (not working for all models yet)
exportable (bool): set layer config so that model is traceable / ONNX exportable (not fully impl/obeyed yet)
no_jit (bool): set layer config so that model doesn't utilize jit scripted layers (so far activations only)
Keyword Args:
drop_rate (float): dropout rate for training (default: 0.0)
global_pool (str): global pool type (default: 'avg')
**: other kwargs are model specific
"""
margs = dict(pretrained=pretrained, num_classes=num_classes, in_chans=in_chans)
model_args = dict(pretrained=pretrained, num_classes=num_classes, in_chans=in_chans)
# Only EfficientNet and MobileNetV3 models have support for batchnorm params or drop_connect_rate passed as args
is_efficientnet = is_model_in_modules(model_name, ['efficientnet', 'mobilenetv3'])
@ -47,11 +54,12 @@ def create_model(
if kwargs.get('drop_path_rate', None) is None:
kwargs.pop('drop_path_rate', None)
if is_model(model_name):
create_fn = model_entrypoint(model_name)
model = create_fn(**margs, **kwargs)
else:
raise RuntimeError('Unknown model (%s)' % model_name)
with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit):
if is_model(model_name):
create_fn = model_entrypoint(model_name)
model = create_fn(**model_args, **kwargs)
else:
raise RuntimeError('Unknown model (%s)' % model_name)
if checkpoint_path:
load_checkpoint(model, checkpoint_path)
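With the context manager wrapping model creation, script-friendly layer selection becomes a create_model argument; a usage sketch:

import torch
import timm

model = timm.create_model('efficientnet_b0', scriptable=True)
scripted = torch.jit.script(model.eval())  # layers were chosen to be script-compatible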

@ -1,5 +1,8 @@
import torch
from collections import defaultdict, OrderedDict
from functools import partial
from typing import List
class FeatureHooks:
@ -25,7 +28,7 @@ class FeatureHooks:
x = x[0] # unwrap input tuple
self._feature_outputs[x.device][name] = x
def get_output(self, device):
output = tuple(self._feature_outputs[device].values())[::-1]
def get_output(self, device) -> List[torch.Tensor]:
output = list(self._feature_outputs[device].values())
self._feature_outputs[device] = OrderedDict() # clear after reading
return output

@ -3,17 +3,11 @@ This file evolved from https://github.com/pytorch/vision 'resnet.py' with (SE)-R
and ports of Gluon variations (https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/resnet.py)
by Ross Wightman
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .layers import SEModule
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
from .resnet import ResNet, Bottleneck, BasicBlock
@ -202,8 +196,8 @@ def gluon_resnet50_v1e(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
model = ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
stem_width=64, stem_type='deep', avg_down=True, **kwargs)
model.default_cfg = default_cfg
#if pretrained:
# load_pretrained(model, default_cfg, num_classes, in_chans)
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans)
return model

@ -6,15 +6,15 @@ Original PyTorch DeepLab impl: https://github.com/jfzhang95/pytorch-deeplab-xcep
Hacked together by Ross Wightman
"""
import torch
from collections import OrderedDict
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
__all__ = ['Xception65', 'Xception71']
@ -47,7 +47,6 @@ default_cfgs = {
}
}
""" PADDING NOTES
The original PyTorch and Gluon impl of these models dutifully reproduced the
aligned padding added to Tensorflow models for Deeplab. This padding was compensating
@ -223,7 +222,7 @@ class Xception65(nn.Module):
norm_layer=norm_layer, norm_kwargs=norm_kwargs, start_with_relu=True, grow_first=True, is_last=True)
# Middle flow
self.mid = nn.Sequential(OrderedDict([('block%d' % i, Block(
728, 728, num_reps=3, stride=1, dilation=middle_block_dilation,
norm_layer=norm_layer, norm_kwargs=norm_kwargs, start_with_relu=True, grow_first=True))
for i in range(4, 20)]))
@ -333,7 +332,7 @@ class Xception71(nn.Module):
exit_block_dilations = (2, 4)
else:
raise NotImplementedError
# Entry flow
self.conv1 = nn.Conv2d(in_chans, 32, kernel_size=3, stride=2, padding=1, bias=False)
self.bn1 = norm_layer(num_features=32, **norm_kwargs)
@ -394,7 +393,11 @@ class Xception71(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.fc = nn.Linear(num_features, num_classes)
else:
self.fc = nn.Identity()
def forward_features(self, x):
# Entry flow
@ -465,4 +468,3 @@ def gluon_xception71(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans)
return model

@ -6,10 +6,10 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .registry import register_model
__all__ = ['InceptionResnetV2']
@ -193,7 +193,6 @@ class Mixed_7a(nn.Module):
class Block8(nn.Module):
__constants__ = ['relu'] # for pre 1.4 torchscript compat
def __init__(self, scale=1.0, no_relu=False):
super(Block8, self).__init__()
@ -296,8 +295,11 @@ class InceptionResnetV2(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.classif = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.classif = nn.Linear(num_features, num_classes)
else:
self.classif = nn.Identity()
def forward_features(self, x):
x = self.conv2d_1a(x)

@ -2,12 +2,11 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .helpers import load_pretrained
from .registry import register_model
from .layers import trunc_normal_, SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
__all__ = []
def _cfg(url='', **kwargs):
return {

@ -6,10 +6,10 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .registry import register_model
__all__ = ['InceptionV4']
@ -280,8 +280,11 @@ class InceptionV4(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.last_linear = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.last_linear = nn.Linear(num_features, num_classes)
else:
self.last_linear = nn.Identity()
def forward_features(self, x):
return self.features(x)
@ -303,6 +306,3 @@ def inception_v4(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
if pretrained:
load_pretrained(model, default_cfg, num_classes, in_chans)
return model

@ -1,22 +1,29 @@
from .padding import get_padding
from .pool2d_same import AvgPool2dSame
from .conv2d_same import Conv2dSame
from .conv_bn_act import ConvBnAct
from .mixed_conv2d import MixedConv2d
from .cond_conv2d import CondConv2d, get_condconv_initializer
from .pool2d_same import create_pool2d
from .create_conv2d import create_conv2d
from .create_attn import create_attn
from .selective_kernel import SelectiveKernelConv
from .se import SEModule
from .eca import EcaModule, CecaModule
from .activations import *
from .adaptive_avgmax_pool import \
adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
from .test_time_pool import TestTimePoolHead, apply_test_time_pool
from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
from .anti_aliasing import AntiAliasDownsampleLayer
from .space_to_depth import SpaceToDepthModule
from .blur_pool import BlurPool2d
from .cond_conv2d import CondConv2d, get_condconv_initializer
from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\
set_layer_config
from .conv2d_same import Conv2dSame
from .conv_bn_act import ConvBnAct
from .create_act import create_act_layer, get_act_layer, get_act_fn
from .create_attn import create_attn
from .create_conv2d import create_conv2d
from .create_norm_act import create_norm_act, get_norm_act_layer
from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
from .eca import EcaModule, CecaModule
from .evo_norm import EvoNormBatch2d, EvoNormSample2d
from .inplace_abn import InplaceAbn
from .mixed_conv2d import MixedConv2d
from .norm_act import BatchNormAct2d
from .padding import get_padding
from .pool2d_same import AvgPool2dSame, create_pool2d
from .se import SEModule
from .selective_kernel import SelectiveKernelConv
from .separable_conv import SeparableConv2d, SeparableConvBnAct
from .space_to_depth import SpaceToDepthModule
from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
from .test_time_pool import TestTimePoolHead, apply_test_time_pool
from .weight_init import trunc_normal_

@ -6,85 +6,15 @@ easily be swapped. All have an `inplace` arg even if not used.
Hacked together by Ross Wightman
"""
import torch
from torch import nn as nn
from torch.nn import functional as F
_USE_MEM_EFFICIENT_ISH = True
if _USE_MEM_EFFICIENT_ISH:
# This version reduces memory overhead of Swish during training by
# recomputing torch.sigmoid(x) in backward instead of saving it.
@torch.jit.script
def swish_jit_fwd(x):
return x.mul(torch.sigmoid(x))
@torch.jit.script
def swish_jit_bwd(x, grad_output):
x_sigmoid = torch.sigmoid(x)
return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
class SwishJitAutoFn(torch.autograd.Function):
""" torch.jit.script optimised Swish
Inspired by conversation btw Jeremy Howard & Adam Paszke
https://twitter.com/jeremyphoward/status/1188251041835315200
"""
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return swish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return swish_jit_bwd(x, grad_output)
def swish(x, _inplace=False):
return SwishJitAutoFn.apply(x)
@torch.jit.script
def mish_jit_fwd(x):
return x.mul(torch.tanh(F.softplus(x)))
@torch.jit.script
def mish_jit_bwd(x, grad_output):
x_sigmoid = torch.sigmoid(x)
x_tanh_sp = F.softplus(x).tanh()
return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
class MishJitAutoFn(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return mish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return mish_jit_bwd(x, grad_output)
def mish(x, _inplace=False):
return MishJitAutoFn.apply(x)
else:
def swish(x, inplace: bool = False):
"""Swish - Described in: https://arxiv.org/abs/1710.05941
"""
return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
def mish(x, _inplace: bool = False):
"""Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
"""
return x.mul(F.softplus(x).tanh())
def swish(x, inplace: bool = False):
"""Swish - Described in: https://arxiv.org/abs/1710.05941
"""
return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
class Swish(nn.Module):
@ -96,13 +26,21 @@ class Swish(nn.Module):
return swish(x, self.inplace)
def mish(x, inplace: bool = False):
"""Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
NOTE: I don't have a working inplace variant
"""
return x.mul(F.softplus(x).tanh())
class Mish(nn.Module):
"""Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
"""
def __init__(self, inplace: bool = False):
super(Mish, self).__init__()
self.inplace = inplace
def forward(self, x):
return mish(x, self.inplace)
return mish(x)
def sigmoid(x, inplace: bool = False):
@ -162,3 +100,22 @@ class HardSigmoid(nn.Module):
def forward(self, x):
return hard_sigmoid(x, self.inplace)
def hard_mish(x, inplace: bool = False):
""" Hard Mish
Experimental, based on notes by Mish author Diganta Misra at
https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md
"""
if inplace:
return x.mul_(0.5 * (x + 2).clamp(min=0, max=2))
else:
return 0.5 * x * (x + 2).clamp(min=0, max=2)
class HardMish(nn.Module):
def __init__(self, inplace: bool = False):
super(HardMish, self).__init__()
self.inplace = inplace
def forward(self, x):
return hard_mish(x, self.inplace)
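A few reference points for the piecewise form above: hard_mish is exactly zero for x <= -2, dips to -0.5 at x = -1, and is the identity for x >= 0.

import torch

x = torch.tensor([-3., -2., -1., 0., 1., 2.])
print(hard_mish(x))  # tensor([ 0.0000,  0.0000, -0.5000,  0.0000,  1.0000,  2.0000])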

@ -0,0 +1,90 @@
""" Activations
A collection of jit-scripted activation fns and modules with a common interface so that they can
easily be swapped. All have an `inplace` arg even if not used.
All jit-scripted activations omit in-place variants on purpose: scripted kernel fusion does not
currently work across in-place op boundaries, so performance would be equal to or worse than the
non-scripted versions if they contained in-place ops.
Hacked together by Ross Wightman
"""
import torch
from torch import nn as nn
from torch.nn import functional as F
@torch.jit.script
def swish_jit(x, inplace: bool = False):
"""Swish - Described in: https://arxiv.org/abs/1710.05941
"""
return x.mul(x.sigmoid())
@torch.jit.script
def mish_jit(x, _inplace: bool = False):
"""Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
"""
return x.mul(F.softplus(x).tanh())
class SwishJit(nn.Module):
def __init__(self, inplace: bool = False):
super(SwishJit, self).__init__()
def forward(self, x):
return swish_jit(x)
class MishJit(nn.Module):
def __init__(self, inplace: bool = False):
super(MishJit, self).__init__()
def forward(self, x):
return mish_jit(x)
@torch.jit.script
def hard_sigmoid_jit(x, inplace: bool = False):
# return F.relu6(x + 3.) / 6.
return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
class HardSigmoidJit(nn.Module):
def __init__(self, inplace: bool = False):
super(HardSigmoidJit, self).__init__()
def forward(self, x):
return hard_sigmoid_jit(x)
@torch.jit.script
def hard_swish_jit(x, inplace: bool = False):
# return x * (F.relu6(x + 3.) / 6)
return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster?
class HardSwishJit(nn.Module):
def __init__(self, inplace: bool = False):
super(HardSwishJit, self).__init__()
def forward(self, x):
return hard_swish_jit(x)
@torch.jit.script
def hard_mish_jit(x, inplace: bool = False):
""" Hard Mish
Experimental, based on notes by Mish author Diganta Misra at
https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md
"""
return 0.5 * x * (x + 2).clamp(min=0, max=2)
class HardMishJit(nn.Module):
def __init__(self, inplace: bool = False):
super(HardMishJit, self).__init__()
def forward(self, x):
return hard_mish_jit(x)

@ -0,0 +1,208 @@
""" Activations (memory-efficient w/ custom autograd)
A collection of activation fns and modules with a common interface so that they can
easily be swapped. All have an `inplace` arg even if not used.
These activations are not compatible with jit scripting or ONNX export of the model; use the
JIT or basic versions of the activations instead.
Hacked together by Ross Wightman
"""
import torch
from torch import nn as nn
from torch.nn import functional as F
@torch.jit.script
def swish_jit_fwd(x):
return x.mul(torch.sigmoid(x))
@torch.jit.script
def swish_jit_bwd(x, grad_output):
x_sigmoid = torch.sigmoid(x)
return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid)))
class SwishJitAutoFn(torch.autograd.Function):
""" torch.jit.script optimised Swish w/ memory-efficient checkpoint
Inspired by conversation btw Jeremy Howard & Adam Pazske
https://twitter.com/jeremyphoward/status/1188251041835315200
"""
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return swish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return swish_jit_bwd(x, grad_output)
def swish_me(x, inplace=False):
return SwishJitAutoFn.apply(x)
class SwishMe(nn.Module):
def __init__(self, inplace: bool = False):
super(SwishMe, self).__init__()
def forward(self, x):
return SwishJitAutoFn.apply(x)
@torch.jit.script
def mish_jit_fwd(x):
return x.mul(torch.tanh(F.softplus(x)))
@torch.jit.script
def mish_jit_bwd(x, grad_output):
x_sigmoid = torch.sigmoid(x)
x_tanh_sp = F.softplus(x).tanh()
return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp))
class MishJitAutoFn(torch.autograd.Function):
""" Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
A memory efficient, jit scripted variant of Mish
"""
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return mish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return mish_jit_bwd(x, grad_output)
def mish_me(x, inplace=False):
return MishJitAutoFn.apply(x)
class MishMe(nn.Module):
def __init__(self, inplace: bool = False):
super(MishMe, self).__init__()
def forward(self, x):
return MishJitAutoFn.apply(x)
@torch.jit.script
def hard_sigmoid_jit_fwd(x, inplace: bool = False):
return (x + 3).clamp(min=0, max=6).div(6.)
@torch.jit.script
def hard_sigmoid_jit_bwd(x, grad_output):
m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6.
return grad_output * m
class HardSigmoidJitAutoFn(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return hard_sigmoid_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return hard_sigmoid_jit_bwd(x, grad_output)
def hard_sigmoid_me(x, inplace: bool = False):
return HardSigmoidJitAutoFn.apply(x)
class HardSigmoidMe(nn.Module):
def __init__(self, inplace: bool = False):
super(HardSigmoidMe, self).__init__()
def forward(self, x):
return HardSigmoidJitAutoFn.apply(x)
@torch.jit.script
def hard_swish_jit_fwd(x):
return x * (x + 3).clamp(min=0, max=6).div(6.)
@torch.jit.script
def hard_swish_jit_bwd(x, grad_output):
m = torch.ones_like(x) * (x >= 3.)
m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m)
return grad_output * m
class HardSwishJitAutoFn(torch.autograd.Function):
"""A memory efficient, jit-scripted HardSwish activation"""
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return hard_swish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return hard_swish_jit_bwd(x, grad_output)
def hard_swish_me(x, inplace=False):
return HardSwishJitAutoFn.apply(x)
class HardSwishMe(nn.Module):
def __init__(self, inplace: bool = False):
super(HardSwishMe, self).__init__()
def forward(self, x):
return HardSwishJitAutoFn.apply(x)
@torch.jit.script
def hard_mish_jit_fwd(x):
return 0.5 * x * (x + 2).clamp(min=0, max=2)
@torch.jit.script
def hard_mish_jit_bwd(x, grad_output):
m = torch.ones_like(x) * (x >= -2.)
m = torch.where((x >= -2.) & (x <= 0.), x + 1., m)
return grad_output * m
class HardMishJitAutoFn(torch.autograd.Function):
""" A memory efficient, jit scripted variant of Hard Mish
Experimental, based on notes by Mish author Diganta Misra at
https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md
"""
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return hard_mish_jit_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return hard_mish_jit_bwd(x, grad_output)
def hard_mish_me(x, inplace: bool = False):
return HardMishJitAutoFn.apply(x)
class HardMishMe(nn.Module):
def __init__(self, inplace: bool = False):
super(HardMishMe, self).__init__()
def forward(self, x):
return HardMishJitAutoFn.apply(x)
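Because the ME variants recompute the cheap forward expression inside backward() instead of saving intermediates, their gradients should match the plain eager versions up to floating point; a quick parity sketch (assuming both variants are imported, e.g. via timm.models.layers):

import torch

x1 = torch.randn(64, dtype=torch.double, requires_grad=True)
x2 = x1.detach().clone().requires_grad_(True)

hard_mish(x1).sum().backward()     # plain eager version from activations.py
hard_mish_me(x2).sum().backward()  # memory-efficient custom autograd version
assert torch.allclose(x1.grad, x2.grad)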

@ -15,7 +15,7 @@ from torch.nn import functional as F
from .helpers import tup_pair
from .conv2d_same import conv2d_same
from timm.models.layers.padding import get_padding_value
from .padding import get_padding_value
def get_condconv_initializer(initializer, num_experts, expert_shape):
@ -38,7 +38,7 @@ class CondConv2d(nn.Module):
Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
https://github.com/pytorch/pytorch/issues/17983
"""
__constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
__constants__ = ['in_channels', 'out_channels', 'dynamic_padding']
def __init__(self, in_channels, out_channels, kernel_size=3,
stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):

@ -0,0 +1,115 @@
""" Model / Layer Config singleton state
"""
from typing import Any, Optional
__all__ = [
'is_exportable', 'is_scriptable', 'is_no_jit',
'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config'
]
# Set to True if prefer to have layers with no jit optimization (includes activations)
_NO_JIT = False
# Set to True if prefer to have activation layers with no jit optimization
# NOTE not currently used as no difference between no_jit and no_activation jit as only layers obeying
# the jit flags so far are activations. This will change as more layers are updated and/or added.
_NO_ACTIVATION_JIT = False
# Set to True if exporting a model with Same padding via ONNX
_EXPORTABLE = False
# Set to True if wanting to use torch.jit.script on a model
_SCRIPTABLE = False
def is_no_jit():
return _NO_JIT
class set_no_jit:
def __init__(self, mode: bool) -> None:
global _NO_JIT
self.prev = _NO_JIT
_NO_JIT = mode
def __enter__(self) -> None:
pass
def __exit__(self, *args: Any) -> bool:
global _NO_JIT
_NO_JIT = self.prev
return False
def is_exportable():
return _EXPORTABLE
class set_exportable:
def __init__(self, mode: bool) -> None:
global _EXPORTABLE
self.prev = _EXPORTABLE
_EXPORTABLE = mode
def __enter__(self) -> None:
pass
def __exit__(self, *args: Any) -> bool:
global _EXPORTABLE
_EXPORTABLE = self.prev
return False
def is_scriptable():
return _SCRIPTABLE
class set_scriptable:
def __init__(self, mode: bool) -> None:
global _SCRIPTABLE
self.prev = _SCRIPTABLE
_SCRIPTABLE = mode
def __enter__(self) -> None:
pass
def __exit__(self, *args: Any) -> bool:
global _SCRIPTABLE
_SCRIPTABLE = self.prev
return False
class set_layer_config:
""" Layer config context manager that allows setting all layer config flags at once.
If a flag arg is None, it will not change the current value.
"""
def __init__(
self,
scriptable: Optional[bool] = None,
exportable: Optional[bool] = None,
no_jit: Optional[bool] = None,
no_activation_jit: Optional[bool] = None):
global _SCRIPTABLE
global _EXPORTABLE
global _NO_JIT
global _NO_ACTIVATION_JIT
self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT
if scriptable is not None:
_SCRIPTABLE = scriptable
if exportable is not None:
_EXPORTABLE = exportable
if no_jit is not None:
_NO_JIT = no_jit
if no_activation_jit is not None:
_NO_ACTIVATION_JIT = no_activation_jit
def __enter__(self) -> None:
pass
def __exit__(self, *args: Any) -> bool:
global _SCRIPTABLE
global _EXPORTABLE
global _NO_JIT
global _NO_ACTIVATION_JIT
_SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev
return False
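Typical usage wraps model creation so every layer built inside picks up the flags; a sketch:

import timm
from timm.models.layers import set_layer_config

# build with export-friendly layers (e.g. plain activations instead of scripted / custom autograd ones)
with set_layer_config(exportable=True, no_jit=True):
    model = timm.create_model('efficientnet_b0', pretrained=False)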

@ -7,8 +7,7 @@ import torch.nn as nn
import torch.nn.functional as F
from typing import Tuple, Optional
from timm.models.layers.padding import get_padding_value
from .padding import pad_same
from .padding import pad_same, get_padding_value
def conv2d_same(

@ -4,33 +4,28 @@ Hacked together by Ross Wightman
"""
from torch import nn as nn
from timm.models.layers import get_padding
from .create_conv2d import create_conv2d
from .create_norm_act import convert_norm_act_type
class ConvBnAct(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1,
drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None):
def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1,
norm_layer=nn.BatchNorm2d, norm_kwargs=None, act_layer=nn.ReLU, apply_act=True,
drop_block=None, aa_layer=None):
super(ConvBnAct, self).__init__()
padding = get_padding(kernel_size, stride, dilation) # assuming PyTorch style padding for this block
use_aa = aa_layer is not None
self.conv = nn.Conv2d(
in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=1 if use_aa else stride,
self.conv = create_conv2d(
in_channels, out_channels, kernel_size, stride=1 if use_aa else stride,
padding=padding, dilation=dilation, groups=groups, bias=False)
self.bn = norm_layer(out_channels)
# NOTE for backwards compatibility with models that use separate norm and act layer definitions
norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs)
self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args)
self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None
self.drop_block = drop_block
if act_layer is not None:
self.act = act_layer(inplace=True)
else:
self.act = None
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.drop_block is not None:
x = self.drop_block(x)
if self.act is not None:
x = self.act(x)
if self.aa is not None:
x = self.aa(x)
return x

@ -0,0 +1,114 @@
from .activations import *
from .activations_jit import *
from .activations_me import *
from .config import is_exportable, is_scriptable, is_no_jit
_ACT_FN_DEFAULT = dict(
swish=swish,
mish=mish,
relu=F.relu,
relu6=F.relu6,
leaky_relu=F.leaky_relu,
elu=F.elu,
prelu=F.prelu,
celu=F.celu,
selu=F.selu,
gelu=F.gelu,
sigmoid=sigmoid,
tanh=tanh,
hard_sigmoid=hard_sigmoid,
hard_swish=hard_swish,
hard_mish=hard_mish,
)
_ACT_FN_JIT = dict(
swish=swish_jit,
mish=mish_jit,
hard_sigmoid=hard_sigmoid_jit,
hard_swish=hard_swish_jit,
hard_mish=hard_mish_jit
)
_ACT_FN_ME = dict(
swish=swish_me,
mish=mish_me,
hard_sigmoid=hard_sigmoid_me,
hard_swish=hard_swish_me,
hard_mish=hard_mish_me,
)
_ACT_LAYER_DEFAULT = dict(
swish=Swish,
mish=Mish,
relu=nn.ReLU,
relu6=nn.ReLU6,
leaky_relu=nn.LeakyReLU,
elu=nn.ELU,
prelu=nn.PReLU,
celu=nn.CELU,
selu=nn.SELU,
gelu=nn.GELU,
sigmoid=Sigmoid,
tanh=Tanh,
hard_sigmoid=HardSigmoid,
hard_swish=HardSwish,
hard_mish=HardMish,
)
_ACT_LAYER_JIT = dict(
swish=SwishJit,
mish=MishJit,
hard_sigmoid=HardSigmoidJit,
hard_swish=HardSwishJit,
hard_mish=HardMishJit
)
_ACT_LAYER_ME = dict(
swish=SwishMe,
mish=MishMe,
hard_sigmoid=HardSigmoidMe,
hard_swish=HardSwishMe,
hard_mish=HardMishMe,
)
def get_act_fn(name='relu'):
""" Activation Function Factory
Fetching activation fns by name with this function allows export or torch script friendly
functions to be returned dynamically based on current config.
"""
if not name:
return None
if not (is_no_jit() or is_exportable() or is_scriptable()):
# If not exporting or scripting the model, first look for a memory-efficient version with
# custom autograd, then fallback
if name in _ACT_FN_ME:
return _ACT_FN_ME[name]
if not is_no_jit():
if name in _ACT_FN_JIT:
return _ACT_FN_JIT[name]
return _ACT_FN_DEFAULT[name]
def get_act_layer(name='relu'):
""" Activation Layer Factory
Fetching activation layers by name with this function allows export or torch script friendly
layers to be returned dynamically based on current config.
"""
if not name:
return None
if not (is_no_jit() or is_exportable() or is_scriptable()):
if name in _ACT_LAYER_ME:
return _ACT_LAYER_ME[name]
if not is_no_jit():
if name in _ACT_LAYER_JIT:
return _ACT_LAYER_JIT[name]
return _ACT_LAYER_DEFAULT[name]
def create_act_layer(name, inplace=False, **kwargs):
act_layer = get_act_layer(name)
if act_layer is not None:
return act_layer(inplace=inplace, **kwargs)
else:
return None
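Putting the config flags and the factory together, the selection order for a name like 'hard_swish' looks like this (sketch):

from timm.models.layers import get_act_fn, set_scriptable, set_no_jit

fn = get_act_fn('hard_swish')       # default: memory-efficient custom autograd version
with set_scriptable(True):
    fn = get_act_fn('hard_swish')   # jit-scripted functional version
with set_no_jit(True):
    fn = get_act_fn('hard_swish')   # plain eager fallback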

@ -3,7 +3,7 @@
Hacked together by Ross Wightman
"""
import torch
from .se import SEModule
from .se import SEModule, EffectiveSEModule
from .eca import EcaModule, CecaModule
from .cbam import CbamModule, LightCbamModule
@ -15,6 +15,8 @@ def create_attn(attn_type, channels, **kwargs):
attn_type = attn_type.lower()
if attn_type == 'se':
module_cls = SEModule
elif attn_type == 'ese':
module_cls = EffectiveSEModule
elif attn_type == 'eca':
module_cls = EcaModule
elif attn_type == 'ceca':

@ -8,23 +8,23 @@ from .cond_conv2d import CondConv2d
from .conv2d_same import create_conv2d_pad
def create_conv2d(in_chs, out_chs, kernel_size, **kwargs):
def create_conv2d(in_channels, out_channels, kernel_size, **kwargs):
""" Select a 2d convolution implementation based on arguments
Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d.
Used extensively by EfficientNet, MobileNetv3 and related networks.
"""
assert 'groups' not in kwargs # only use 'depthwise' bool arg
if isinstance(kernel_size, list):
assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently
assert 'groups' not in kwargs # MixedConv groups are defined by kernel list
# We're going to use only lists for defining the MixedConv2d kernel groups,
# ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs)
else:
depthwise = kwargs.pop('depthwise', False)
groups = out_chs if depthwise else 1
groups = out_channels if depthwise else kwargs.pop('groups', 1)
if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
else:
m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs)
return m

@ -0,0 +1,64 @@
import types
import functools
import torch
import torch.nn as nn
from .evo_norm import EvoNormBatch2d, EvoNormSample2d
from .norm_act import BatchNormAct2d, GroupNormAct
from .inplace_abn import InplaceAbn
_NORM_ACT_TYPES = {BatchNormAct2d, GroupNormAct, EvoNormBatch2d, EvoNormSample2d, InplaceAbn}
def get_norm_act_layer(layer_class):
layer_class = layer_class.replace('_', '').lower()
if layer_class.startswith("batchnorm"):
layer = BatchNormAct2d
elif layer_class.startswith("groupnorm"):
layer = GroupNormAct
elif layer_class == "evonormbatch":
layer = EvoNormBatch2d
elif layer_class == "evonormsample":
layer = EvoNormSample2d
elif layer_class == "iabn" or layer_class == "inplaceabn":
layer = InplaceAbn
else:
assert False, "Invalid norm_act layer (%s)" % layer_class
return layer
def create_norm_act(layer_type, num_features, apply_act=True, jit=False, **kwargs):
layer_parts = layer_type.split('-') # e.g. batchnorm-leaky_relu
assert len(layer_parts) in (1, 2)
layer = get_norm_act_layer(layer_parts[0])
#activation_class = layer_parts[1].lower() if len(layer_parts) > 1 else '' # FIXME support string act selection?
layer_instance = layer(num_features, apply_act=apply_act, **kwargs)
if jit:
layer_instance = torch.jit.script(layer_instance)
return layer_instance
def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None):
assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial))
assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial))
norm_act_args = norm_kwargs.copy() if norm_kwargs else {}
if isinstance(norm_layer, str):
norm_act_layer = get_norm_act_layer(norm_layer)
elif norm_layer in _NORM_ACT_TYPES:
norm_act_layer = norm_layer
elif isinstance(norm_layer, (types.FunctionType, functools.partial)):
# assuming this is a lambda/fn/bound partial that creates norm_act layer
norm_act_layer = norm_layer
else:
type_name = norm_layer.__name__.lower()
if type_name.startswith('batchnorm'):
norm_act_layer = BatchNormAct2d
elif type_name.startswith('groupnorm'):
norm_act_layer = GroupNormAct
else:
assert False, f"No equivalent norm_act layer for {type_name}"
# Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation.
# Newer models will use `apply_act` and likely have `act_layer` arg bound to relevant NormAct types.
norm_act_args.update(dict(act_layer=act_layer))
return norm_act_layer, norm_act_args
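Both entry points in a small sketch, using the names defined above (the default activation of BatchNormAct2d is assumed to be ReLU):

import torch.nn as nn

bn_act = create_norm_act('batchnorm', 64)   # BatchNormAct2d w/ default activation
evo = create_norm_act('evonormbatch', 64)   # EvoNormBatch2d
iabn = create_norm_act('iabn', 64)          # InplaceAbn (forward requires the inplace_abn package)

# backwards-compat path: convert a separate norm + act definition
norm_act_layer, norm_act_args = convert_norm_act_type(nn.BatchNorm2d, nn.ReLU)
bn_act2 = norm_act_layer(64, **norm_act_args)  # -> BatchNormAct2d(64, act_layer=nn.ReLU)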

@ -17,8 +17,6 @@ Hacked together by Ross Wightman
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
def drop_block_2d(

@ -0,0 +1,81 @@
"""EvoNormB0 (Batched) and EvoNormS0 (Sample) in PyTorch
An attempt at getting decent performing EvoNorms running in PyTorch.
While currently faster than other impl, still quite a ways off the built-in BN
in terms of memory usage and throughput (roughly 5x mem, 1/2 - 1/3x speed).
Still very much a WIP, fiddling with buffer usage, in-place/jit optimizations, and layouts.
Hacked together by Ross Wightman
"""
import torch
import torch.nn as nn
class EvoNormBatch2d(nn.Module):
def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, drop_block=None):
super(EvoNormBatch2d, self).__init__()
self.apply_act = apply_act # apply activation (non-linearity)
self.momentum = momentum
self.eps = eps
param_shape = (1, num_features, 1, 1)
self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True)
self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True)
if apply_act:
self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True)
self.register_buffer('running_var', torch.ones(1, num_features, 1, 1))
self.reset_parameters()
def reset_parameters(self):
nn.init.ones_(self.weight)
nn.init.zeros_(self.bias)
if self.apply_act:
nn.init.ones_(self.v)
def forward(self, x):
assert x.dim() == 4, 'expected 4D input'
x_type = x.dtype
if self.training:
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
self.running_var.copy_(self.momentum * var.detach() + (1 - self.momentum) * self.running_var)
else:
var = self.running_var
if self.apply_act:
v = self.v.to(dtype=x_type)
d = (x * v) + (x.var(dim=(2, 3), unbiased=False, keepdim=True) + self.eps).sqrt().to(dtype=x_type)
d = d.max((var + self.eps).sqrt().to(dtype=x_type))
x = x / d
return x * self.weight + self.bias
class EvoNormSample2d(nn.Module):
def __init__(self, num_features, apply_act=True, groups=8, eps=1e-5, drop_block=None):
super(EvoNormSample2d, self).__init__()
self.apply_act = apply_act # apply activation (non-linearity)
self.groups = groups
self.eps = eps
param_shape = (1, num_features, 1, 1)
self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True)
self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True)
if apply_act:
self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True)
self.reset_parameters()
def reset_parameters(self):
nn.init.ones_(self.weight)
nn.init.zeros_(self.bias)
if self.apply_act:
nn.init.ones_(self.v)
def forward(self, x):
assert x.dim() == 4, 'expected 4D input'
B, C, H, W = x.shape
assert C % self.groups == 0
if self.apply_act:
n = (x * self.v).sigmoid().reshape(B, self.groups, -1)
x = x.reshape(B, self.groups, -1)
x = x * n / (x.var(dim=-1, unbiased=False, keepdim=True) + self.eps).sqrt()  # S0: x * sigmoid(v*x) / group_std
x = x.reshape(B, C, H, W)
return x * self.weight + self.bias
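
Both modules are drop-in replacements for a norm + act pair. A shape-only sketch; the comments restate the formulas the forward passes above compute:

    import torch
    x = torch.randn(2, 32, 8, 8)
    # B0: y = x / max(sqrt(var_batch + eps), v*x + sqrt(var_instance + eps)) * weight + bias
    evo_b = EvoNormBatch2d(32)
    # S0: y = x * sigmoid(v*x) / sqrt(var_group + eps) * weight + bias
    evo_s = EvoNormSample2d(32, groups=8)
    assert evo_b(x).shape == evo_s(x).shape == x.shape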

@ -0,0 +1,85 @@
import torch
from torch import nn as nn
try:
from inplace_abn.functions import inplace_abn, inplace_abn_sync
has_iabn = True
except ImportError:
has_iabn = False
def inplace_abn(x, weight, bias, running_mean, running_var,
training=True, momentum=0.1, eps=1e-05, activation="leaky_relu", activation_param=0.01):
raise ImportError(
"Please install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'")
def inplace_abn_sync(**kwargs):
inplace_abn(**kwargs)
class InplaceAbn(nn.Module):
"""Activated Batch Normalization
This gathers a BatchNorm and an activation function in a single module
Parameters
----------
num_features : int
Number of feature channels in the input and output.
eps : float
Small constant to prevent numerical issues.
momentum : float
Momentum factor applied to compute running statistics.
affine : bool
If `True` apply learned scale and shift transformation after normalization.
act_layer : str or nn.Module type
Name or type of the activation function, one of: `leaky_relu`, `elu`
act_param : float
Negative slope for the `leaky_relu` activation.
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True,
act_layer="leaky_relu", act_param=0.01, drop_block=None,):
super(InplaceAbn, self).__init__()
self.num_features = num_features
self.affine = affine
self.eps = eps
self.momentum = momentum
if apply_act:
if isinstance(act_layer, str):
assert act_layer in ('leaky_relu', 'elu', 'identity')
self.act_name = act_layer
else:
# convert act layer passed as type to string
# act_layer is passed as a class here, so compare types directly;
# isinstance() would wrongly test the class object against nn.ELU
if act_layer == nn.ELU:
self.act_name = 'elu'
elif act_layer == nn.LeakyReLU:
self.act_name = 'leaky_relu'
else:
assert False, f'Invalid act layer {act_layer.__name__} for IABN'
else:
self.act_name = 'identity'
self.act_param = act_param
if self.affine:
self.weight = nn.Parameter(torch.ones(num_features))
self.bias = nn.Parameter(torch.zeros(num_features))
else:
self.register_parameter('weight', None)
self.register_parameter('bias', None)
self.register_buffer('running_mean', torch.zeros(num_features))
self.register_buffer('running_var', torch.ones(num_features))
self.reset_parameters()
def reset_parameters(self):
nn.init.constant_(self.running_mean, 0)
nn.init.constant_(self.running_var, 1)
if self.affine:
nn.init.constant_(self.weight, 1)
nn.init.constant_(self.bias, 0)
def forward(self, x):
output = inplace_abn(
x, self.weight, self.bias, self.running_mean, self.running_var,
self.training, self.momentum, self.eps, self.act_name, self.act_param)
if isinstance(output, tuple):
output = output[0]
return output
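
Usage sketch; this only runs with the mapillary inplace_abn package installed, otherwise the stub above raises ImportError on the first forward:

    import torch
    iabn = InplaceAbn(64, act_layer='leaky_relu', act_param=0.01)
    if has_iabn:
        y = iabn(torch.randn(2, 64, 8, 8))  # fused BN + leaky_relu, computed in-place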

@ -0,0 +1,85 @@
""" Normalization + Activation Layers
"""
import torch
from torch import nn as nn
from torch.nn import functional as F
from .create_act import get_act_layer
class BatchNormAct2d(nn.BatchNorm2d):
"""BatchNorm + Activation
This module performs BatchNorm + Activation in a manner that will remain backwards
compatible with weights trained with separate bn, act. This is why we inherit from BN
instead of composing it as a .bn member.
"""
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True,
apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
super(BatchNormAct2d, self).__init__(
num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats)
if isinstance(act_layer, str):
act_layer = get_act_layer(act_layer)
if act_layer is not None and apply_act:
self.act = act_layer(inplace=inplace)
else:
self.act = None
def _forward_jit(self, x):
""" A cut & paste of the contents of the PyTorch BatchNorm2d forward function
"""
# exponential_average_factor is set to self.momentum (when it is
# available) only so that it gets updated in the ONNX graph when this
# node is exported to ONNX.
if self.momentum is None:
exponential_average_factor = 0.0
else:
exponential_average_factor = self.momentum
if self.training and self.track_running_stats:
# TODO: if statement only here to tell the jit to skip emitting this when it is None
if self.num_batches_tracked is not None:
self.num_batches_tracked += 1
if self.momentum is None: # use cumulative moving average
exponential_average_factor = 1.0 / float(self.num_batches_tracked)
else: # use exponential moving average
exponential_average_factor = self.momentum
x = F.batch_norm(
x, self.running_mean, self.running_var, self.weight, self.bias,
self.training or not self.track_running_stats,
exponential_average_factor, self.eps)
return x
@torch.jit.ignore
def _forward_python(self, x):
return super(BatchNormAct2d, self).forward(x)
def forward(self, x):
# FIXME cannot call parent forward() and maintain jit.script compatibility?
if torch.jit.is_scripting():
x = self._forward_jit(x)
else:
x = self._forward_python(x)
if self.act is not None:
x = self.act(x)
return x
class GroupNormAct(nn.GroupNorm):
def __init__(self, num_groups, num_channels, eps=1e-5, affine=True,
apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None):
super(GroupNormAct, self).__init__(num_groups, num_channels, eps=eps, affine=affine)
if isinstance(act_layer, str):
act_layer = get_act_layer(act_layer)
if act_layer is not None and apply_act:
self.act = act_layer(inplace=inplace)
else:
self.act = None
def forward(self, x):
x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps)
if self.act is not None:
x = self.act(x)
return x
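
The compatibility claim in the BatchNormAct2d docstring can be checked directly: because the module inherits from BatchNorm2d and the activation holds no state, the state_dict keys match a plain BN layer, so old checkpoints load without remapping. A minimal sketch:

    import torch.nn as nn
    bn = nn.BatchNorm2d(64)
    bn_act = BatchNormAct2d(64, act_layer=nn.ReLU)
    assert set(bn.state_dict()) == set(bn_act.state_dict())
    bn_act.load_state_dict(bn.state_dict())  # loads cleanly, no key remapping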

@ -5,8 +5,7 @@ Hacked together by Ross Wightman
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Union, List, Tuple, Optional
import math
from typing import List, Tuple, Optional
from .helpers import tup_pair
from .padding import pad_same, get_padding_value

@ -1,9 +1,11 @@
from torch import nn as nn
from .create_act import get_act_fn
class SEModule(nn.Module):
def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None):
def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None,
gate_fn='sigmoid'):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
reduction_channels = reduction_channels or max(channels // reduction, min_channels)
@ -12,10 +14,27 @@ class SEModule(nn.Module):
self.act = act_layer(inplace=True)
self.fc2 = nn.Conv2d(
reduction_channels, channels, kernel_size=1, padding=0, bias=True)
self.gate_fn = get_act_fn(gate_fn)
def forward(self, x):
x_se = self.avg_pool(x)
x_se = self.fc1(x_se)
x_se = self.act(x_se)
x_se = self.fc2(x_se)
return x * x_se.sigmoid()
return x * self.gate_fn(x_se)
class EffectiveSEModule(nn.Module):
""" 'Effective Squeeze-Excitation
From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
"""
def __init__(self, channel, gate_fn='hard_sigmoid'):
super(EffectiveSEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
self.gate_fn = get_act_fn(gate_fn)
def forward(self, x):
x_se = self.avg_pool(x)
x_se = self.fc(x_se)
return x * self.gate_fn(x_se, inplace=True)
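
With the gate resolved by name through get_act_fn, the excite gate becomes a constructor choice rather than a hard-coded sigmoid. A shape-only sketch:

    import torch
    x = torch.randn(2, 64, 8, 8)
    se = SEModule(64)            # default 'sigmoid' gate, same behavior as before
    ese = EffectiveSEModule(64)  # single 1x1 conv, 'hard_sigmoid' gate
    assert se(x).shape == ese(x).shape == x.shape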

@ -4,7 +4,6 @@ Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
Hacked together by Ross Wightman
"""
import torch
from torch import nn as nn

@ -0,0 +1,51 @@
from torch import nn as nn
from .create_conv2d import create_conv2d
from .create_norm_act import convert_norm_act_type
class SeparableConvBnAct(nn.Module):
""" Separable Conv w/ trailing Norm and Activation
"""
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
act_layer=nn.ReLU, apply_act=True, drop_block=None):
super(SeparableConvBnAct, self).__init__()
norm_kwargs = norm_kwargs or {}
self.conv_dw = create_conv2d(
in_channels, int(in_channels * channel_multiplier), kernel_size,
stride=stride, dilation=dilation, padding=padding, depthwise=True)
self.conv_pw = create_conv2d(
int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias)
norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs)
self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args)
def forward(self, x):
x = self.conv_dw(x)
x = self.conv_pw(x)
if self.bn is not None:
x = self.bn(x)
return x
class SeparableConv2d(nn.Module):
""" Separable Conv
"""
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False,
channel_multiplier=1.0, pw_kernel_size=1):
super(SeparableConv2d, self).__init__()
self.conv_dw = create_conv2d(
in_channels, int(in_channels * channel_multiplier), kernel_size,
stride=stride, dilation=dilation, padding=padding, depthwise=True)
self.conv_pw = create_conv2d(
int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias)
def forward(self, x):
x = self.conv_dw(x)
x = self.conv_pw(x)
return x
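
A sketch of the BnAct variant; with the default padding='' the convs resolve to 'same'-style padding, so only the stride changes the spatial dims:

    import torch
    import torch.nn as nn
    sep = SeparableConvBnAct(32, 64, kernel_size=3, stride=2,
                             norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU)
    y = sep(torch.randn(1, 32, 16, 16))  # depthwise 3x3 /2 -> pointwise 1x1 -> BN + ReLU
    assert y.shape == (1, 64, 8, 8)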

@ -6,6 +6,7 @@ Hacked together by Ross Wightman
import logging
from torch import nn
import torch.nn.functional as F
from .adaptive_avgmax_pool import adaptive_avgmax_pool2d

@ -7,14 +7,19 @@ Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
Hacked together by Ross Wightman
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List
from .efficientnet_builder import *
from .registry import register_model
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d, create_conv2d
from .layers.activations import HardSwish, hard_sigmoid
from .feature_hooks import FeatureHooks
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT
from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights
from .feature_hooks import FeatureHooks
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d, create_conv2d, get_act_fn, hard_sigmoid
from .registry import register_model
__all__ = ['MobileNetV3']
@ -76,7 +81,7 @@ class MobileNetV3(nn.Module):
channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
super(MobileNetV3, self).__init__()
self.num_classes = num_classes
self.num_features = num_features
self.drop_rate = drop_rate
@ -96,7 +101,7 @@ class MobileNetV3(nn.Module):
self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
self.feature_info = builder.features
self._in_chs = builder.in_chs
# Head + Pooling
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.conv_head = create_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
@ -120,8 +125,11 @@ class MobileNetV3(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
self.classifier = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if self.num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.classifier = nn.Linear(num_features, num_classes)
else:
self.classifier = nn.Identity()
def forward_features(self, x):
x = self.conv_stem(x)
@ -201,7 +209,16 @@ class MobileNetV3Features(nn.Module):
return self._feature_info[idx]['num_chs']
return [self._feature_info[i]['num_chs'] for i in self.out_indices]
def forward(self, x):
def feature_info(self, idx=None):
""" Feature Channel Shortcut
Returns feature channel count for each output index if idx == None. If idx is an integer, will
return feature channel count for that feature block index (independent of out_indices setting).
"""
if isinstance(idx, int):
return self._feature_info[idx]
return [self._feature_info[i] for i in self.out_indices]
def forward(self, x) -> List[torch.Tensor]:
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
@ -270,8 +287,8 @@ def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kw
head_bias=False,
channel_multiplier=channel_multiplier,
norm_kwargs=resolve_bn_args(kwargs),
act_layer=HardSwish,
se_kwargs=dict(gate_fn=hard_sigmoid, reduce_mid=True, divisor=1),
act_layer=resolve_act_layer(kwargs, 'hard_swish'),
se_kwargs=dict(gate_fn=get_act_fn('hard_sigmoid'), reduce_mid=True, divisor=1),
**kwargs,
)
model = _create_model(model_kwargs, default_cfgs[variant], pretrained)
@ -290,7 +307,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
if 'small' in variant:
num_features = 1024
if 'minimal' in variant:
act_layer = nn.ReLU
act_layer = resolve_act_layer(kwargs, 'relu')
arch_def = [
# stage 0, 112x112 in
['ds_r1_k3_s2_e1_c16'],
@ -306,7 +323,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
['cn_r1_k1_s1_c576'],
]
else:
act_layer = HardSwish
act_layer = resolve_act_layer(kwargs, 'hard_swish')
arch_def = [
# stage 0, 112x112 in
['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu
@ -324,7 +341,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
else:
num_features = 1280
if 'minimal' in variant:
act_layer = nn.ReLU
act_layer = resolve_act_layer(kwargs, 'relu')
arch_def = [
# stage 0, 112x112 in
['ds_r1_k3_s1_e1_c16'],
@ -342,7 +359,7 @@ def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwarg
['cn_r1_k1_s1_c960'],
]
else:
act_layer = HardSwish
act_layer = resolve_act_layer(kwargs, 'hard_swish')
arch_def = [
# stage 0, 112x112 in
['ds_r1_k3_s1_e1_c16_nre'], # relu
@ -397,7 +414,6 @@ def mobilenetv3_small_075(pretrained=False, **kwargs):
@register_model
def mobilenetv3_small_100(pretrained=False, **kwargs):
print(kwargs)
""" MobileNet V3 """
model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs)
return model
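
The practical effect of the switch to resolve_act_layer: 'act_layer' is popped from kwargs at generation time, so the hard_swish default can be overridden through the model factory, e.g. to keep the graph export-friendly. A sketch, assuming create_model forwards extra kwargs to the entrypoint as elsewhere in this repo:

    import timm
    model = timm.create_model('mobilenetv3_small_100', pretrained=False, act_layer='relu')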

@ -2,10 +2,9 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from .registry import register_model
__all__ = ['NASNetALarge']
@ -187,17 +186,17 @@ class CellStem1(nn.Module):
self.stem_size = stem_size
self.conv_1x1 = nn.Sequential()
self.conv_1x1.add_module('relu', nn.ReLU())
self.conv_1x1.add_module('conv', nn.Conv2d(2*self.num_channels, self.num_channels, 1, stride=1, bias=False))
self.conv_1x1.add_module('conv', nn.Conv2d(2 * self.num_channels, self.num_channels, 1, stride=1, bias=False))
self.conv_1x1.add_module('bn', nn.BatchNorm2d(self.num_channels, eps=0.001, momentum=0.1, affine=True))
self.relu = nn.ReLU()
self.path_1 = nn.Sequential()
self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_1.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels//2, 1, stride=1, bias=False))
self.path_1.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False))
self.path_2 = nn.ModuleList()
self.path_2.add_module('pad', nn.ZeroPad2d((0, 1, 0, 1)))
self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False))
self.path_2.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels//2, 1, stride=1, bias=False))
self.path_2.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False))
self.final_path_bn = nn.BatchNorm2d(self.num_channels, eps=0.001, momentum=0.1, affine=True)
@ -507,50 +506,50 @@ class NASNetALarge(nn.Module):
self.cell_stem_0 = CellStem0(self.stem_size, num_channels=channels // (channel_multiplier ** 2))
self.cell_stem_1 = CellStem1(self.stem_size, num_channels=channels // channel_multiplier)
self.cell_0 = FirstCell(in_channels_left=channels, out_channels_left=channels//2,
in_channels_right=2*channels, out_channels_right=channels)
self.cell_1 = NormalCell(in_channels_left=2*channels, out_channels_left=channels,
in_channels_right=6*channels, out_channels_right=channels)
self.cell_2 = NormalCell(in_channels_left=6*channels, out_channels_left=channels,
in_channels_right=6*channels, out_channels_right=channels)
self.cell_3 = NormalCell(in_channels_left=6*channels, out_channels_left=channels,
in_channels_right=6*channels, out_channels_right=channels)
self.cell_4 = NormalCell(in_channels_left=6*channels, out_channels_left=channels,
in_channels_right=6*channels, out_channels_right=channels)
self.cell_5 = NormalCell(in_channels_left=6*channels, out_channels_left=channels,
in_channels_right=6*channels, out_channels_right=channels)
self.reduction_cell_0 = ReductionCell0(in_channels_left=6*channels, out_channels_left=2*channels,
in_channels_right=6*channels, out_channels_right=2*channels)
self.cell_6 = FirstCell(in_channels_left=6*channels, out_channels_left=channels,
in_channels_right=8*channels, out_channels_right=2*channels)
self.cell_7 = NormalCell(in_channels_left=8*channels, out_channels_left=2*channels,
in_channels_right=12*channels, out_channels_right=2*channels)
self.cell_8 = NormalCell(in_channels_left=12*channels, out_channels_left=2*channels,
in_channels_right=12*channels, out_channels_right=2*channels)
self.cell_9 = NormalCell(in_channels_left=12*channels, out_channels_left=2*channels,
in_channels_right=12*channels, out_channels_right=2*channels)
self.cell_10 = NormalCell(in_channels_left=12*channels, out_channels_left=2*channels,
in_channels_right=12*channels, out_channels_right=2*channels)
self.cell_11 = NormalCell(in_channels_left=12*channels, out_channels_left=2*channels,
in_channels_right=12*channels, out_channels_right=2*channels)
self.reduction_cell_1 = ReductionCell1(in_channels_left=12*channels, out_channels_left=4*channels,
in_channels_right=12*channels, out_channels_right=4*channels)
self.cell_12 = FirstCell(in_channels_left=12*channels, out_channels_left=2*channels,
in_channels_right=16*channels, out_channels_right=4*channels)
self.cell_13 = NormalCell(in_channels_left=16*channels, out_channels_left=4*channels,
in_channels_right=24*channels, out_channels_right=4*channels)
self.cell_14 = NormalCell(in_channels_left=24*channels, out_channels_left=4*channels,
in_channels_right=24*channels, out_channels_right=4*channels)
self.cell_15 = NormalCell(in_channels_left=24*channels, out_channels_left=4*channels,
in_channels_right=24*channels, out_channels_right=4*channels)
self.cell_16 = NormalCell(in_channels_left=24*channels, out_channels_left=4*channels,
in_channels_right=24*channels, out_channels_right=4*channels)
self.cell_17 = NormalCell(in_channels_left=24*channels, out_channels_left=4*channels,
in_channels_right=24*channels, out_channels_right=4*channels)
self.cell_0 = FirstCell(in_channels_left=channels, out_channels_left=channels // 2,
in_channels_right=2 * channels, out_channels_right=channels)
self.cell_1 = NormalCell(in_channels_left=2 * channels, out_channels_left=channels,
in_channels_right=6 * channels, out_channels_right=channels)
self.cell_2 = NormalCell(in_channels_left=6 * channels, out_channels_left=channels,
in_channels_right=6 * channels, out_channels_right=channels)
self.cell_3 = NormalCell(in_channels_left=6 * channels, out_channels_left=channels,
in_channels_right=6 * channels, out_channels_right=channels)
self.cell_4 = NormalCell(in_channels_left=6 * channels, out_channels_left=channels,
in_channels_right=6 * channels, out_channels_right=channels)
self.cell_5 = NormalCell(in_channels_left=6 * channels, out_channels_left=channels,
in_channels_right=6 * channels, out_channels_right=channels)
self.reduction_cell_0 = ReductionCell0(in_channels_left=6 * channels, out_channels_left=2 * channels,
in_channels_right=6 * channels, out_channels_right=2 * channels)
self.cell_6 = FirstCell(in_channels_left=6 * channels, out_channels_left=channels,
in_channels_right=8 * channels, out_channels_right=2 * channels)
self.cell_7 = NormalCell(in_channels_left=8 * channels, out_channels_left=2 * channels,
in_channels_right=12 * channels, out_channels_right=2 * channels)
self.cell_8 = NormalCell(in_channels_left=12 * channels, out_channels_left=2 * channels,
in_channels_right=12 * channels, out_channels_right=2 * channels)
self.cell_9 = NormalCell(in_channels_left=12 * channels, out_channels_left=2 * channels,
in_channels_right=12 * channels, out_channels_right=2 * channels)
self.cell_10 = NormalCell(in_channels_left=12 * channels, out_channels_left=2 * channels,
in_channels_right=12 * channels, out_channels_right=2 * channels)
self.cell_11 = NormalCell(in_channels_left=12 * channels, out_channels_left=2 * channels,
in_channels_right=12 * channels, out_channels_right=2 * channels)
self.reduction_cell_1 = ReductionCell1(in_channels_left=12 * channels, out_channels_left=4 * channels,
in_channels_right=12 * channels, out_channels_right=4 * channels)
self.cell_12 = FirstCell(in_channels_left=12 * channels, out_channels_left=2 * channels,
in_channels_right=16 * channels, out_channels_right=4 * channels)
self.cell_13 = NormalCell(in_channels_left=16 * channels, out_channels_left=4 * channels,
in_channels_right=24 * channels, out_channels_right=4 * channels)
self.cell_14 = NormalCell(in_channels_left=24 * channels, out_channels_left=4 * channels,
in_channels_right=24 * channels, out_channels_right=4 * channels)
self.cell_15 = NormalCell(in_channels_left=24 * channels, out_channels_left=4 * channels,
in_channels_right=24 * channels, out_channels_right=4 * channels)
self.cell_16 = NormalCell(in_channels_left=24 * channels, out_channels_left=4 * channels,
in_channels_right=24 * channels, out_channels_right=4 * channels)
self.cell_17 = NormalCell(in_channels_left=24 * channels, out_channels_left=4 * channels,
in_channels_right=24 * channels, out_channels_right=4 * channels)
self.relu = nn.ReLU()
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
@ -562,9 +561,11 @@ class NASNetALarge(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
del self.last_linear
self.last_linear = nn.Linear(
self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.last_linear = nn.Linear(num_features, num_classes)
else:
self.last_linear = nn.Identity()
def forward_features(self, x):
x_conv0 = self.conv0(x)

@ -6,15 +6,16 @@
"""
from __future__ import print_function, division, absolute_import
from collections import OrderedDict
import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from .registry import register_model
__all__ = ['PNASNet5Large']
@ -42,11 +43,12 @@ class MaxPool(nn.Module):
self.pool = nn.MaxPool2d(kernel_size, stride=stride, padding=padding)
def forward(self, x):
if self.zero_pad:
if self.zero_pad is not None:
x = self.zero_pad(x)
x = self.pool(x)
if self.zero_pad:
x = self.pool(x)
x = x[:, :, 1:, 1:]
else:
x = self.pool(x)
return x
@ -89,11 +91,12 @@ class BranchSeparables(nn.Module):
def forward(self, x):
x = self.relu_1(x)
if self.zero_pad:
if self.zero_pad is not None:
x = self.zero_pad(x)
x = self.separable_1(x)
if self.zero_pad:
x = self.separable_1(x)
x = x[:, :, 1:, 1:].contiguous()
else:
x = self.separable_1(x)
x = self.bn_sep_1(x)
x = self.relu_2(x)
x = self.separable_2(x)
@ -170,15 +173,14 @@ class CellBase(nn.Module):
x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right
x_comb_iter_4_left = self.comb_iter_4_left(x_left)
if self.comb_iter_4_right:
if self.comb_iter_4_right is not None:
x_comb_iter_4_right = self.comb_iter_4_right(x_right)
else:
x_comb_iter_4_right = x_right
x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right
x_out = torch.cat(
[x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3,
x_comb_iter_4], 1)
[x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1)
return x_out
@ -279,9 +281,8 @@ class Cell(CellBase):
kernel_size=3, stride=stride,
zero_pad=zero_pad)
if is_reduction:
self.comb_iter_4_right = ReluConvBn(out_channels_right,
out_channels_right,
kernel_size=1, stride=stride)
self.comb_iter_4_right = ReluConvBn(
out_channels_right, out_channels_right, kernel_size=1, stride=stride)
else:
self.comb_iter_4_right = None
@ -349,11 +350,11 @@ class PNASNet5Large(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
del self.last_linear
if num_classes:
self.last_linear = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
num_features = self.num_features * self.global_pool.feat_mult()
self.last_linear = nn.Linear(num_features, num_classes)
else:
self.last_linear = None
self.last_linear = nn.Identity()
def forward_features(self, x):
x_conv_0 = self.conv_0(x)

@ -31,30 +31,30 @@ def _mcfg(**kwargs):
# Model FLOPS = three trailing digits * 10^8
model_cfgs = dict(
x_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13),
x_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22),
x_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16),
x_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16),
x_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18),
x_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25),
x_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23),
x_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17),
x_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23),
x_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19),
x_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22),
x_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23),
y_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25),
y_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25),
y_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25),
y_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25),
y_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25),
y_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25),
y_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25),
y_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25),
y_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25),
y_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25),
y_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25),
y_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25),
regnetx_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13),
regnetx_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22),
regnetx_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16),
regnetx_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16),
regnetx_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18),
regnetx_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25),
regnetx_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23),
regnetx_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17),
regnetx_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23),
regnetx_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19),
regnetx_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22),
regnetx_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23),
regnety_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25),
regnety_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25),
regnety_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25),
regnety_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25),
regnety_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25),
regnety_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25),
regnety_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25),
regnety_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25),
regnety_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25),
regnety_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25),
regnety_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25),
regnety_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25),
)
@ -68,30 +68,30 @@ def _cfg(url=''):
default_cfgs = dict(
x_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_002-e7e85e5c.pth'),
x_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_004-7d0e9424.pth'),
x_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_006-85ec1baa.pth'),
x_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_008-d8b470eb.pth'),
x_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_016-65ca972a.pth'),
x_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_032-ed0c7f7e.pth'),
x_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_040-73c2a654.pth'),
x_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_064-29278baa.pth'),
x_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_080-7c7fcab1.pth'),
x_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'),
x_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'),
x_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'),
y_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'),
y_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'),
y_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'),
y_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_008-dc900dbe.pth'),
y_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_016-54367f74.pth'),
y_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_032-62b47782.pth'),
y_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth'),
y_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_064-0a48325c.pth'),
y_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_080-e7f3eb93.pth'),
y_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_120-721ba79a.pth'),
y_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_160-d64013cd.pth'),
y_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'),
regnetx_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_002-e7e85e5c.pth'),
regnetx_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_004-7d0e9424.pth'),
regnetx_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_006-85ec1baa.pth'),
regnetx_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_008-d8b470eb.pth'),
regnetx_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_016-65ca972a.pth'),
regnetx_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_032-ed0c7f7e.pth'),
regnetx_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_040-73c2a654.pth'),
regnetx_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_064-29278baa.pth'),
regnetx_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_080-7c7fcab1.pth'),
regnetx_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'),
regnetx_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'),
regnetx_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'),
regnety_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'),
regnety_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'),
regnety_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'),
regnety_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_008-dc900dbe.pth'),
regnety_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_016-54367f74.pth'),
regnety_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_032-62b47782.pth'),
regnety_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth'),
regnety_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_064-0a48325c.pth'),
regnety_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_080-e7f3eb93.pth'),
regnety_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_120-721ba79a.pth'),
regnety_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_160-d64013cd.pth'),
regnety_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'),
)
@ -344,142 +344,142 @@ def _regnet(variant, pretrained, **kwargs):
@register_model
def regnetx_002(pretrained=False, **kwargs):
"""RegNetX-200MF"""
return _regnet('x_002', pretrained, **kwargs)
return _regnet('regnetx_002', pretrained, **kwargs)
@register_model
def regnetx_004(pretrained=False, **kwargs):
"""RegNetX-400MF"""
return _regnet('x_004', pretrained, **kwargs)
return _regnet('regnetx_004', pretrained, **kwargs)
@register_model
def regnetx_006(pretrained=False, **kwargs):
"""RegNetX-600MF"""
return _regnet('x_006', pretrained, **kwargs)
return _regnet('regnetx_006', pretrained, **kwargs)
@register_model
def regnetx_008(pretrained=False, **kwargs):
"""RegNetX-800MF"""
return _regnet('x_008', pretrained, **kwargs)
return _regnet('regnetx_008', pretrained, **kwargs)
@register_model
def regnetx_016(pretrained=False, **kwargs):
"""RegNetX-1.6GF"""
return _regnet('x_016', pretrained, **kwargs)
return _regnet('regnetx_016', pretrained, **kwargs)
@register_model
def regnetx_032(pretrained=False, **kwargs):
"""RegNetX-3.2GF"""
return _regnet('x_032', pretrained, **kwargs)
return _regnet('regnetx_032', pretrained, **kwargs)
@register_model
def regnetx_040(pretrained=False, **kwargs):
"""RegNetX-4.0GF"""
return _regnet('x_040', pretrained, **kwargs)
return _regnet('regnetx_040', pretrained, **kwargs)
@register_model
def regnetx_064(pretrained=False, **kwargs):
"""RegNetX-6.4GF"""
return _regnet('x_064', pretrained, **kwargs)
return _regnet('regnetx_064', pretrained, **kwargs)
@register_model
def regnetx_080(pretrained=False, **kwargs):
"""RegNetX-8.0GF"""
return _regnet('x_080', pretrained, **kwargs)
return _regnet('regnetx_080', pretrained, **kwargs)
@register_model
def regnetx_120(pretrained=False, **kwargs):
"""RegNetX-12GF"""
return _regnet('x_120', pretrained, **kwargs)
return _regnet('regnetx_120', pretrained, **kwargs)
@register_model
def regnetx_160(pretrained=False, **kwargs):
"""RegNetX-16GF"""
return _regnet('x_160', pretrained, **kwargs)
return _regnet('regnetx_160', pretrained, **kwargs)
@register_model
def regnetx_320(pretrained=False, **kwargs):
"""RegNetX-32GF"""
return _regnet('x_320', pretrained, **kwargs)
return _regnet('regnetx_320', pretrained, **kwargs)
@register_model
def regnety_002(pretrained=False, **kwargs):
"""RegNetY-200MF"""
return _regnet('y_002', pretrained, **kwargs)
return _regnet('regnety_002', pretrained, **kwargs)
@register_model
def regnety_004(pretrained=False, **kwargs):
"""RegNetY-400MF"""
return _regnet('y_004', pretrained, **kwargs)
return _regnet('regnety_004', pretrained, **kwargs)
@register_model
def regnety_006(pretrained=False, **kwargs):
"""RegNetY-600MF"""
return _regnet('y_006', pretrained, **kwargs)
return _regnet('regnety_006', pretrained, **kwargs)
@register_model
def regnety_008(pretrained=False, **kwargs):
"""RegNetY-800MF"""
return _regnet('y_008', pretrained, **kwargs)
return _regnet('regnety_008', pretrained, **kwargs)
@register_model
def regnety_016(pretrained=False, **kwargs):
"""RegNetY-1.6GF"""
return _regnet('y_016', pretrained, **kwargs)
return _regnet('regnety_016', pretrained, **kwargs)
@register_model
def regnety_032(pretrained=False, **kwargs):
"""RegNetY-3.2GF"""
return _regnet('y_032', pretrained, **kwargs)
return _regnet('regnety_032', pretrained, **kwargs)
@register_model
def regnety_040(pretrained=False, **kwargs):
"""RegNetY-4.0GF"""
return _regnet('y_040', pretrained, **kwargs)
return _regnet('regnety_040', pretrained, **kwargs)
@register_model
def regnety_064(pretrained=False, **kwargs):
"""RegNetY-6.4GF"""
return _regnet('y_064', pretrained, **kwargs)
return _regnet('regnety_064', pretrained, **kwargs)
@register_model
def regnety_080(pretrained=False, **kwargs):
"""RegNetY-8.0GF"""
return _regnet('y_080', pretrained, **kwargs)
return _regnet('regnety_080', pretrained, **kwargs)
@register_model
def regnety_120(pretrained=False, **kwargs):
"""RegNetY-12GF"""
return _regnet('y_120', pretrained, **kwargs)
return _regnet('regnety_120', pretrained, **kwargs)
@register_model
def regnety_160(pretrained=False, **kwargs):
"""RegNetY-16GF"""
return _regnet('y_160', pretrained, **kwargs)
return _regnet('regnety_160', pretrained, **kwargs)
@register_model
def regnety_320(pretrained=False, **kwargs):
"""RegNetY-32GF"""
return _regnet('y_320', pretrained, **kwargs)
return _regnet('regnety_320', pretrained, **kwargs)
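
Net effect of the rename: model_cfgs keys, default_cfgs keys, and the @register_model entrypoint names now share one variant string, so a lookup by model name is unambiguous. Sketch:

    import timm
    assert 'regnety_032' in timm.list_models('regnet*')
    model = timm.create_model('regnety_032', pretrained=False)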

@ -6,13 +6,11 @@ import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from .resnet import ResNet
from .registry import register_model
from .helpers import load_pretrained
from .layers import SEModule
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .registry import register_model
from .resnet import ResNet
__all__ = []
@ -79,6 +77,8 @@ class Bottle2neck(nn.Module):
if self.is_first:
# FIXME this should probably have count_include_pad=False, but hurts original weights
self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1)
else:
self.pool = None
self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False)
self.bn3 = norm_layer(outplanes)
@ -99,14 +99,22 @@ class Bottle2neck(nn.Module):
spx = torch.split(out, self.width, 1)
spo = []
sp = spx[0] # redundant, for torchscript
for i, (conv, bn) in enumerate(zip(self.convs, self.bns)):
sp = spx[i] if i == 0 or self.is_first else sp + spx[i]
if i == 0 or self.is_first:
sp = spx[i]
else:
sp = sp + spx[i]
sp = conv(sp)
sp = bn(sp)
sp = self.relu(sp)
spo.append(sp)
if self.scale > 1 :
spo.append(self.pool(spx[-1]) if self.is_first else spx[-1])
if self.scale > 1:
if self.pool is not None:
# self.is_first == True, None check for torchscript
spo.append(self.pool(spx[-1]))
else:
spo.append(spx[-1])
out = torch.cat(spo, 1)
out = self.conv3(out)
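
The pattern in this hunk, assign the attribute in both branches and test `is not None`, is what lets torchscript type self.pool as an optional module; truthiness tests and conditionally-defined attributes don't script. A standalone sketch of the idiom (hypothetical module, not from this diff):

    import torch
    import torch.nn as nn

    class MaybePool(nn.Module):
        def __init__(self, pool: bool):
            super().__init__()
            # assigned in both cases so the attribute has a stable Optional type
            self.pool = nn.AvgPool2d(3, stride=2, padding=1) if pool else None

        def forward(self, x):
            if self.pool is not None:
                x = self.pool(x)
            return x

    scripted = torch.jit.script(MaybePool(pool=True))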

@ -10,10 +10,10 @@ import math
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained, adapt_model_from_file
from .layers import SelectAdaptivePool2d, DropBlock2d, DropPath, AvgPool2dSame, create_attn, BlurPool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
__all__ = ['ResNet', 'BasicBlock', 'Bottleneck'] # model_registry will add each entrypoint fn to this
@ -200,7 +200,6 @@ class BasicBlock(nn.Module):
class Bottleneck(nn.Module):
__constants__ = ['se', 'downsample'] # for pre 1.4 torchscript compat
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
@ -377,6 +376,7 @@ class ResNet(nn.Module):
global_pool : str, default 'avg'
Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
"""
def __init__(self, block, layers, num_classes=1000, in_chans=3,
cardinality=1, base_width=64, stem_width=64, stem_type='',
block_reduce_first=1, down_kernel_size=1, avg_down=False, output_stride=32,
@ -482,8 +482,11 @@ class ResNet(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
del self.fc
self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.fc = nn.Linear(num_features, num_classes)
else:
self.fc = nn.Identity()
def forward_features(self, x):
x = self.conv1(x)
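
With nn.Identity in place of None, reset_classifier(0) turns the model into a pooled-feature extractor without touching forward(). Sketch:

    import torch
    import timm
    model = timm.create_model('resnet50', pretrained=False)
    model.reset_classifier(0)                 # fc -> nn.Identity
    out = model(torch.randn(1, 3, 224, 224))  # pooled features, (1, 2048) for resnet50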

@ -9,16 +9,16 @@ https://arxiv.org/abs/1907.00837
Based on ResNet implementation in https://github.com/rwightman/pytorch-image-models
and SelecSLS Net implementation in https://github.com/mehtadushy/SelecSLS-Pytorch
"""
import math
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
__all__ = ['SelecSLS'] # model_registry will add each entrypoint fn to this
@ -53,6 +53,27 @@ default_cfgs = {
}
class SequentialList(nn.Sequential):
def __init__(self, *args):
super(SequentialList, self).__init__(*args)
@torch.jit._overload_method # noqa: F811
def forward(self, x):
# type: (List[torch.Tensor]) -> (List[torch.Tensor])
pass
@torch.jit._overload_method # noqa: F811
def forward(self, x):
# type: (torch.Tensor) -> (List[torch.Tensor])
pass
def forward(self, x) -> List[torch.Tensor]:
for module in self:
x = module(x)
return x
def conv_bn(in_chs, out_chs, k=3, stride=1, padding=None, dilation=1):
if padding is None:
padding = ((stride - 1) + dilation * (k - 1)) // 2
@ -78,7 +99,7 @@ class SelecSLSBlock(nn.Module):
self.conv5 = conv_bn(mid_chs, mid_chs // 2, 3)
self.conv6 = conv_bn(2 * mid_chs + (0 if is_first else skip_chs), out_chs, 1)
def forward(self, x):
def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]:
assert isinstance(x, list)
assert len(x) in [1, 2]
@ -114,7 +135,7 @@ class SelecSLS(nn.Module):
super(SelecSLS, self).__init__()
self.stem = conv_bn(in_chans, 32, stride=2)
self.features = nn.Sequential(*[cfg['block'](*block_args) for block_args in cfg['features']])
self.features = SequentialList(*[cfg['block'](*block_args) for block_args in cfg['features']])
self.head = nn.Sequential(*[conv_bn(*conv_args) for conv_args in cfg['head']])
self.num_features = cfg['num_features']
@ -134,11 +155,11 @@ class SelecSLS(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
self.num_classes = num_classes
del self.fc
if num_classes:
self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
num_features = self.num_features * self.global_pool.feat_mult()
self.fc = nn.Linear(num_features, num_classes)
else:
self.fc = None
self.fc = nn.Identity()
def forward_features(self, x):
x = self.stem(x)
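
The @torch.jit._overload_method pair declares both accepted signatures (Tensor in, List[Tensor] in) so the stage-to-stage list passing type-checks under torchscript; a plain nn.Sequential forward is typed Tensor -> Tensor. With that in place the whole model should script, e.g.:

    import torch
    import timm
    model = timm.create_model('selecsls60', pretrained=False)
    scripted = torch.jit.script(model)  # works once features is a SequentialList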

@ -8,16 +8,16 @@ Original model: https://github.com/hujie-frank/SENet
ResNet code gently borrowed from
https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
"""
from collections import OrderedDict
import math
from collections import OrderedDict
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
__all__ = ['SENet']
@ -369,11 +369,11 @@ class SENet(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.avg_pool = SelectAdaptivePool2d(pool_type=global_pool)
del self.last_linear
if num_classes:
self.last_linear = nn.Linear(self.num_features * self.avg_pool.feat_mult(), num_classes)
num_features = self.num_features * self.avg_pool.feat_mult()
self.last_linear = nn.Linear(num_features, num_classes)
else:
self.last_linear = None
self.last_linear = nn.Identity()
def forward_features(self, x):
x = self.layer0(x)

@ -5,20 +5,16 @@ https://arxiv.org/pdf/2003.13630.pdf
Original model: https://github.com/mrT23/TResNet
"""
from collections import OrderedDict
from functools import partial
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from .layers import SpaceToDepthModule, AntiAliasDownsampleLayer, SelectAdaptivePool2d
from .registry import register_model
from .helpers import load_pretrained
try:
from inplace_abn import InPlaceABN
has_iabn = True
except ImportError:
has_iabn = False
from .helpers import load_pretrained
from .layers import SpaceToDepthModule, AntiAliasDownsampleLayer, SelectAdaptivePool2d, InplaceAbn
from .registry import register_model
__all__ = ['tresnet_m', 'tresnet_l', 'tresnet_xl']
@ -88,38 +84,38 @@ class FastSEModule(nn.Module):
def IABN2Float(module: nn.Module) -> nn.Module:
"If `module` is IABN don't use half precision."
if isinstance(module, InPlaceABN):
"""If `module` is IABN don't use half precision."""
if isinstance(module, InplaceAbn):
module.float()
for child in module.children():
IABN2Float(child)
return module
def conv2d_ABN(ni, nf, stride, activation="leaky_relu", kernel_size=3, activation_param=1e-2, groups=1):
def conv2d_iabn(ni, nf, stride, kernel_size=3, groups=1, act_layer="leaky_relu", act_param=1e-2):
return nn.Sequential(
nn.Conv2d(
ni, nf, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=groups, bias=False),
InPlaceABN(num_features=nf, activation=activation, activation_param=activation_param)
InplaceAbn(nf, act_layer=act_layer, act_param=act_param)
)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, anti_alias_layer=None):
def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, aa_layer=None):
super(BasicBlock, self).__init__()
if stride == 1:
self.conv1 = conv2d_ABN(inplanes, planes, stride=1, activation_param=1e-3)
self.conv1 = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3)
else:
if anti_alias_layer is None:
self.conv1 = conv2d_ABN(inplanes, planes, stride=2, activation_param=1e-3)
if aa_layer is None:
self.conv1 = conv2d_iabn(inplanes, planes, stride=2, act_param=1e-3)
else:
self.conv1 = nn.Sequential(
conv2d_ABN(inplanes, planes, stride=1, activation_param=1e-3),
anti_alias_layer(channels=planes, filt_size=3, stride=2))
conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3),
aa_layer(channels=planes, filt_size=3, stride=2))
self.conv2 = conv2d_ABN(planes, planes, stride=1, activation="identity")
self.conv2 = conv2d_iabn(planes, planes, stride=1, act_layer="identity")
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
@ -146,24 +142,25 @@ class BasicBlock(nn.Module):
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, anti_alias_layer=None):
def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True,
act_layer="leaky_relu", aa_layer=None):
super(Bottleneck, self).__init__()
self.conv1 = conv2d_ABN(
inplanes, planes, kernel_size=1, stride=1, activation="leaky_relu", activation_param=1e-3)
self.conv1 = conv2d_iabn(
inplanes, planes, kernel_size=1, stride=1, act_layer=act_layer, act_param=1e-3)
if stride == 1:
self.conv2 = conv2d_ABN(
planes, planes, kernel_size=3, stride=1, activation="leaky_relu", activation_param=1e-3)
self.conv2 = conv2d_iabn(
planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3)
else:
if anti_alias_layer is None:
self.conv2 = conv2d_ABN(
planes, planes, kernel_size=3, stride=2, activation="leaky_relu", activation_param=1e-3)
if aa_layer is None:
self.conv2 = conv2d_iabn(
planes, planes, kernel_size=3, stride=2, act_layer=act_layer, act_param=1e-3)
else:
self.conv2 = nn.Sequential(
conv2d_ABN(planes, planes, kernel_size=3, stride=1, activation="leaky_relu", activation_param=1e-3),
anti_alias_layer(channels=planes, filt_size=3, stride=2))
conv2d_iabn(planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3),
aa_layer(channels=planes, filt_size=3, stride=2))
self.conv3 = conv2d_ABN(
planes, planes * self.expansion, kernel_size=1, stride=1, activation="identity")
self.conv3 = conv2d_iabn(
planes, planes * self.expansion, kernel_size=1, stride=1, act_layer="identity")
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
@ -193,30 +190,26 @@ class Bottleneck(nn.Module):
class TResNet(nn.Module):
def __init__(self, layers, in_chans=3, num_classes=1000, width_factor=1.0, no_aa_jit=False,
global_pool='avg', drop_rate=0.):
if not has_iabn:
raise ImportError(
"For TResNet models, please install InplaceABN: "
"'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'")
self.num_classes = num_classes
self.drop_rate = drop_rate
super(TResNet, self).__init__()
# JIT layers
space_to_depth = SpaceToDepthModule()
anti_alias_layer = partial(AntiAliasDownsampleLayer, no_jit=no_aa_jit)
aa_layer = partial(AntiAliasDownsampleLayer, no_jit=no_aa_jit)
# TResnet stages
self.inplanes = int(64 * width_factor)
self.planes = int(64 * width_factor)
conv1 = conv2d_ABN(in_chans * 16, self.planes, stride=1, kernel_size=3)
conv1 = conv2d_iabn(in_chans * 16, self.planes, stride=1, kernel_size=3)
layer1 = self._make_layer(
BasicBlock, self.planes, layers[0], stride=1, use_se=True, anti_alias_layer=anti_alias_layer) # 56x56
BasicBlock, self.planes, layers[0], stride=1, use_se=True, aa_layer=aa_layer) # 56x56
layer2 = self._make_layer(
BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, anti_alias_layer=anti_alias_layer) # 28x28
BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, aa_layer=aa_layer) # 28x28
layer3 = self._make_layer(
Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, anti_alias_layer=anti_alias_layer) # 14x14
Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, aa_layer=aa_layer) # 14x14
layer4 = self._make_layer(
Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, anti_alias_layer=anti_alias_layer) # 7x7
Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, aa_layer=aa_layer) # 7x7
# body
self.body = nn.Sequential(OrderedDict([
@ -237,7 +230,7 @@ class TResNet(nn.Module):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu')
elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InPlaceABN):
elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InplaceAbn):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
@ -249,24 +242,24 @@ class TResNet(nn.Module):
m.conv3[1].weight = nn.Parameter(torch.zeros_like(m.conv3[1].weight)) # BN to zero
if isinstance(m, nn.Linear): m.weight.data.normal_(0, 0.01)
def _make_layer(self, block, planes, blocks, stride=1, use_se=True, anti_alias_layer=None):
def _make_layer(self, block, planes, blocks, stride=1, use_se=True, aa_layer=None):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
layers = []
if stride == 2:
# avg pooling before 1x1 conv
layers.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True, count_include_pad=False))
layers += [conv2d_ABN(
self.inplanes, planes * block.expansion, kernel_size=1, stride=1, activation="identity")]
layers += [conv2d_iabn(
self.inplanes, planes * block.expansion, kernel_size=1, stride=1, act_layer="identity")]
downsample = nn.Sequential(*layers)
layers = []
layers.append(block(
self.inplanes, planes, stride, downsample, use_se=use_se, anti_alias_layer=anti_alias_layer))
self.inplanes, planes, stride, downsample, use_se=use_se, aa_layer=aa_layer))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(
block(self.inplanes, planes, use_se=use_se, anti_alias_layer=anti_alias_layer))
block(self.inplanes, planes, use_se=use_se, aa_layer=aa_layer))
return nn.Sequential(*layers)
def get_classifier(self):
@ -277,8 +270,10 @@ class TResNet(nn.Module):
self.num_classes = num_classes
self.head = None
if num_classes:
self.head = nn.Sequential(OrderedDict([
('fc', nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes))]))
num_features = self.num_features * self.global_pool.feat_mult()
self.head = nn.Sequential(OrderedDict([('fc', nn.Linear(num_features, num_classes))]))
else:
self.head = nn.Sequential(OrderedDict([('fc', nn.Identity())]))
def forward_features(self, x):
return self.body(x)
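
Note the aa_layer plumbing: the constructor binds no_jit once with functools.partial and each block instantiates the blur/downsample with its own channels, the same factory-style hook used for act and norm layers elsewhere in the PR. Sketch of the idiom:

    from functools import partial
    # bind the flag once; blocks later call aa_layer(channels=..., filt_size=3, stride=2)
    aa_layer = partial(AntiAliasDownsampleLayer, no_jit=True)
    blur = aa_layer(channels=64, filt_size=3, stride=2)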

@ -0,0 +1,414 @@
""" VoVNet (V1 & V2)
Papers:
* `An Energy and GPU-Computation Efficient Backbone Network` - https://arxiv.org/abs/1904.09730
* `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
Looked at https://github.com/youngwanLEE/vovnet-detectron2 &
https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
for some reference, rewrote most of the code.
Hacked together by Ross Wightman
"""
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from .registry import register_model
from .helpers import load_pretrained
from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, SelectAdaptivePool2d, \
create_attn, create_norm_act, get_norm_act_layer
# model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 &
# https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py
model_cfgs = dict(
vovnet39a=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 1, 2, 2],
residual=False,
depthwise=False,
attn='',
),
vovnet57a=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 1, 4, 3],
residual=False,
depthwise=False,
attn='',
),
ese_vovnet19b_slim_dw=dict(
stem_chs=[64, 64, 64],
stage_conv_chs=[64, 80, 96, 112],
stage_out_chs=[112, 256, 384, 512],
layer_per_block=3,
block_per_stage=[1, 1, 1, 1],
residual=True,
depthwise=True,
attn='ese',
),
ese_vovnet19b_dw=dict(
stem_chs=[64, 64, 64],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=3,
block_per_stage=[1, 1, 1, 1],
residual=True,
depthwise=True,
attn='ese',
),
ese_vovnet19b_slim=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[64, 80, 96, 112],
stage_out_chs=[112, 256, 384, 512],
layer_per_block=3,
block_per_stage=[1, 1, 1, 1],
residual=True,
depthwise=False,
attn='ese',
),
ese_vovnet19b=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=3,
block_per_stage=[1, 1, 1, 1],
residual=True,
depthwise=False,
attn='ese',
),
ese_vovnet39b=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 1, 2, 2],
residual=True,
depthwise=False,
attn='ese',
),
ese_vovnet57b=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 1, 4, 3],
residual=True,
depthwise=False,
attn='ese',
),
ese_vovnet99b=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 3, 9, 3],
residual=True,
depthwise=False,
attn='ese',
),
eca_vovnet39b=dict(
stem_chs=[64, 64, 128],
stage_conv_chs=[128, 160, 192, 224],
stage_out_chs=[256, 512, 768, 1024],
layer_per_block=5,
block_per_stage=[1, 1, 2, 2],
residual=True,
depthwise=False,
attn='eca',
),
)
model_cfgs['ese_vovnet39b_evos'] = model_cfgs['ese_vovnet39b']
model_cfgs['ese_vovnet99b_iabn'] = model_cfgs['ese_vovnet99b']
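# experimental aliases: same topology as the base cfgs, differing only in the
# norm + activation layer supplied at model creation time (see registrations below)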
def _cfg(url=''):
return {
'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
'crop_pct': 0.875, 'interpolation': 'bicubic',
'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
'first_conv': 'stem.0.conv', 'classifier': 'head.fc',
}
default_cfgs = dict(
vovnet39a=_cfg(url=''),
vovnet57a=_cfg(url=''),
ese_vovnet19b_slim_dw=_cfg(url=''),
ese_vovnet19b_dw=_cfg(url=''),
ese_vovnet19b_slim=_cfg(url=''),
ese_vovnet39b=_cfg(
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet39b-f912fe73.pth'),
ese_vovnet57b=_cfg(url=''),
ese_vovnet99b=_cfg(url=''),
eca_vovnet39b=_cfg(url=''),
ese_vovnet39b_evos=_cfg(url=''),
ese_vovnet99b_iabn=_cfg(url=''),
)
class SequentialAppendList(nn.Sequential):
def __init__(self, *args):
super(SequentialAppendList, self).__init__(*args)
def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor:
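# run children sequentially, appending each intermediate output to concat_list
# (the caller seeds it with the block input), then concatenate everything on
# the channel dimension for one-shot aggregation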
for i, module in enumerate(self):
if i == 0:
concat_list.append(module(x))
else:
concat_list.append(module(concat_list[-1]))
x = torch.cat(concat_list, dim=1)
return x
class OsaBlock(nn.Module):
def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
depthwise=False, attn='', norm_layer=BatchNormAct2d):
super(OsaBlock, self).__init__()
self.residual = residual
self.depthwise = depthwise
next_in_chs = in_chs
if self.depthwise and next_in_chs != mid_chs:
assert not residual
self.conv_reduction = ConvBnAct(next_in_chs, mid_chs, 1, norm_layer=norm_layer)
else:
self.conv_reduction = None
mid_convs = []
for i in range(layer_per_block):
if self.depthwise:
conv = SeparableConvBnAct(mid_chs, mid_chs, norm_layer=norm_layer)
else:
conv = ConvBnAct(next_in_chs, mid_chs, 3, norm_layer=norm_layer)
next_in_chs = mid_chs
mid_convs.append(conv)
self.conv_mid = SequentialAppendList(*mid_convs)
# feature aggregation
next_in_chs = in_chs + layer_per_block * mid_chs
self.conv_concat = ConvBnAct(next_in_chs, out_chs, norm_layer=norm_layer)
if attn:
self.attn = create_attn(attn, out_chs)
else:
self.attn = None
def forward(self, x):
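# One-Shot Aggregation: the block input and every mid-conv output are kept
# and concatenated once at the end, then projected by conv_concat; attention
# (ese/eca) and the identity residual are applied after the projection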
output = [x]
if self.conv_reduction is not None:
x = self.conv_reduction(x)
x = self.conv_mid(x, output)
x = self.conv_concat(x)
if self.attn is not None:
x = self.attn(x)
if self.residual:
x = x + output[0]
return x
class OsaStage(nn.Module):
def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block,
downsample=True, residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d):
super(OsaStage, self).__init__()
if downsample:
self.pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
else:
self.pool = None
blocks = []
for i in range(block_per_stage):
last_block = i == block_per_stage - 1
blocks += [OsaBlock(
in_chs if i == 0 else out_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0,
depthwise=depthwise, attn=attn if last_block else '', norm_layer=norm_layer)
]
self.blocks = nn.Sequential(*blocks)
def forward(self, x):
if self.pool is not None:
x = self.pool(x)
x = self.blocks(x)
return x
class ClassifierHead(nn.Module):
"""Head."""
def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.):
super(ClassifierHead, self).__init__()
self.drop_rate = drop_rate
self.global_pool = SelectAdaptivePool2d(pool_type=pool_type)
if num_classes > 0:
self.fc = nn.Linear(in_chs, num_classes, bias=True)
else:
self.fc = nn.Identity()
def forward(self, x):
x = self.global_pool(x).flatten(1)
if self.drop_rate:
x = F.dropout(x, p=float(self.drop_rate), training=self.training)
x = self.fc(x)
return x
class VovNet(nn.Module):
def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4,
norm_layer=BatchNormAct2d):
""" VovNet (v2)
"""
super(VovNet, self).__init__()
self.num_classes = num_classes
self.drop_rate = drop_rate
assert stem_stride in (4, 2)
stem_chs = cfg["stem_chs"]
stage_conv_chs = cfg["stage_conv_chs"]
stage_out_chs = cfg["stage_out_chs"]
block_per_stage = cfg["block_per_stage"]
layer_per_block = cfg["layer_per_block"]
# Stem module
last_stem_stride = stem_stride // 2
conv_type = SeparableConvBnAct if cfg["depthwise"] else ConvBnAct
self.stem = nn.Sequential(*[
ConvBnAct(in_chans, stem_chs[0], 3, stride=2, norm_layer=norm_layer),
conv_type(stem_chs[0], stem_chs[1], 3, stride=1, norm_layer=norm_layer),
conv_type(stem_chs[1], stem_chs[2], 3, stride=last_stem_stride, norm_layer=norm_layer),
])
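# overall stem stride equals stem_stride: the first conv is stride 2 and the
# last stem conv takes the remainder (2 when stem_stride == 4, else 1)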
# OSA stages
in_ch_list = stem_chs[-1:] + stage_out_chs[:-1]
stage_args = dict(
residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], norm_layer=norm_layer)
stages = []
for i in range(4): # num_stages
downsample = stem_stride == 2 or i > 0 # first stage has no stride/downsample if stem_stride is 4
stages += [OsaStage(
in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block,
downsample=downsample, **stage_args)
]
self.num_features = stage_out_chs[i]
self.stages = nn.Sequential(*stages)
self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
for n, m in self.named_modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1.)
nn.init.constant_(m.bias, 0.)
elif isinstance(m, nn.Linear):
nn.init.zeros_(m.bias)
def get_classifier(self):
return self.head.fc
def reset_classifier(self, num_classes, global_pool='avg'):
self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
def forward_features(self, x):
x = self.stem(x)
return self.stages(x)
def forward(self, x):
x = self.forward_features(x)
return self.head(x)
def _vovnet(variant, pretrained=False, **kwargs):
load_strict = True
model_class = VovNet
if kwargs.pop('features_only', False):
assert False, 'Not Implemented' # TODO
load_strict = False
kwargs.pop('num_classes', 0)
model_cfg = model_cfgs[variant]
default_cfg = default_cfgs[variant]
model = model_class(model_cfg, **kwargs)
model.default_cfg = default_cfg
if pretrained:
load_pretrained(
model, default_cfg,
num_classes=kwargs.get('num_classes', 0), in_chans=kwargs.get('in_chans', 3), strict=load_strict)
return model
@register_model
def vovnet39a(pretrained=False, **kwargs):
return _vovnet('vovnet39a', pretrained=pretrained, **kwargs)
@register_model
def vovnet57a(pretrained=False, **kwargs):
return _vovnet('vovnet57a', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet19b_slim_dw(pretrained=False, **kwargs):
return _vovnet('ese_vovnet19b_slim_dw', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet19b_dw(pretrained=False, **kwargs):
return _vovnet('ese_vovnet19b_dw', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet19b_slim(pretrained=False, **kwargs):
return _vovnet('ese_vovnet19b_slim', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet39b(pretrained=False, **kwargs):
return _vovnet('ese_vovnet39b', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet57b(pretrained=False, **kwargs):
return _vovnet('ese_vovnet57b', pretrained=pretrained, **kwargs)
@register_model
def ese_vovnet99b(pretrained=False, **kwargs):
return _vovnet('ese_vovnet99b', pretrained=pretrained, **kwargs)
@register_model
def eca_vovnet39b(pretrained=False, **kwargs):
return _vovnet('eca_vovnet39b', pretrained=pretrained, **kwargs)
# Experimental Models
@register_model
def ese_vovnet39b_evos(pretrained=False, **kwargs):
def norm_act_fn(num_features, **kwargs):
return create_norm_act('EvoNormSample', num_features, jit=False, **kwargs)
return _vovnet('ese_vovnet39b_evos', pretrained=pretrained, norm_layer=norm_act_fn, **kwargs)
@register_model
def ese_vovnet99b_iabn(pretrained=False, **kwargs):
norm_layer = get_norm_act_layer('iabn')
return _vovnet('ese_vovnet99b_iabn', pretrained=pretrained, norm_layer=norm_layer, **kwargs)

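A brief usage sketch for the new VoVNet registrations (hedged; per default_cfgs above, only ese_vovnet39b currently has released weights, so pretrained=False keeps this runnable offline):
import torch
import timm
model = timm.create_model('ese_vovnet39b', pretrained=False, num_classes=10)
logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 10])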
@ -21,15 +21,13 @@ normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from .registry import register_model
from .helpers import load_pretrained
from .layers import SelectAdaptivePool2d
from .registry import register_model
__all__ = ['Xception']
@ -181,8 +179,11 @@ class Xception(nn.Module):
def reset_classifier(self, num_classes, global_pool='avg'):
self.num_classes = num_classes
self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
del self.fc
self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes) if num_classes else None
if num_classes:
num_features = self.num_features * self.global_pool.feat_mult()
self.fc = nn.Linear(num_features, num_classes)
else:
self.fc = nn.Identity()
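# nn.Identity (rather than None) keeps forward() usable after reset_classifier(0)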
def forward_features(self, x):
x = self.conv1(x)

@ -23,12 +23,12 @@ def create_scheduler(args, optimizer):
lr_scheduler = CosineLRScheduler(
optimizer,
t_initial=num_epochs,
t_mul=1.0,
t_mul=args.lr_cycle_mul,
lr_min=args.min_lr,
decay_rate=args.decay_rate,
warmup_lr_init=args.warmup_lr,
warmup_t=args.warmup_epochs,
cycle_limit=1,
cycle_limit=args.lr_cycle_limit,
t_in_epochs=True,
noise_range_t=noise_range,
noise_pct=args.lr_noise_pct,
@ -40,11 +40,11 @@ def create_scheduler(args, optimizer):
lr_scheduler = TanhLRScheduler(
optimizer,
t_initial=num_epochs,
t_mul=1.0,
t_mul=args.lr_cycle_mul,
lr_min=args.min_lr,
warmup_lr_init=args.warmup_lr,
warmup_t=args.warmup_epochs,
cycle_limit=1,
cycle_limit=args.lr_cycle_limit,
t_in_epochs=True,
noise_range_t=noise_range,
noise_pct=args.lr_noise_pct,

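A hedged sketch of the cycle arithmetic the new args expose, assuming timm's SGDR-style restart semantics (each successive cycle is t_mul times the previous length, and cycle_limit caps the number of cycles before the LR holds at lr_min):
# illustrative values only
t_initial, lr_cycle_mul, lr_cycle_limit = 10, 2.0, 3
cycle_lengths = [int(t_initial * lr_cycle_mul ** i) for i in range(lr_cycle_limit)]
print(cycle_lengths)  # [10, 20, 40] -> restarts at epochs 10 and 30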
@ -1 +1 @@
__version__ = '0.1.26'
__version__ = '0.1.28'

@ -111,6 +111,10 @@ parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT
help='learning rate noise limit percent (default: 0.67)')
parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV',
help='learning rate noise std-dev (default: 1.0)')
parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT',
help='learning rate cycle len multiplier (default: 1.0)')
parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N',
help='learning rate cycle limit')
parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR',
help='warmup learning rate (default: 0.0001)')
parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR',

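The two new options slot into existing train.py invocations, e.g. `python train.py /path/to/data --sched cosine --epochs 70 --lr-cycle-mul 2.0 --lr-cycle-limit 3` (hypothetical command; the data path is a placeholder and --sched/--epochs are pre-existing args). The defaults (1.0 and 1) reproduce the previously hard-coded values, so existing configs are unaffected.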
@ -24,7 +24,8 @@ try:
except ImportError:
has_apex = False
from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models
from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models,\
set_scriptable, set_no_jit
from timm.data import Dataset, DatasetTar, create_loader, resolve_data_config
from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging
@ -87,9 +88,10 @@ def validate(args):
# create model
model = create_model(
args.model,
pretrained=args.pretrained,
num_classes=args.num_classes,
in_chans=3,
pretrained=args.pretrained)
scriptable=args.torchscript)
if args.checkpoint:
load_checkpoint(model, args.checkpoint, args.use_ema)
@ -141,8 +143,11 @@ def validate(args):
top5 = AverageMeter()
model.eval()
end = time.time()
with torch.no_grad():
# warmup pass to reduce variability of the first batch time, especially when comparing torchscript vs non-torchscript
input = torch.randn((args.batch_size,) + data_config['input_size']).cuda()
model(input)
end = time.time()
for i, (input, target) in enumerate(loader):
if args.no_prefetcher:
target = target.cuda()
@ -234,6 +239,7 @@ def main():
raise e
batch_size = max(batch_size // 2, args.num_gpu)
print("Validation failed, reducing batch size by 50%")
torch.cuda.empty_cache()
result.update(r)
if args.checkpoint:
result['checkpoint'] = args.checkpoint

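For context, a minimal sketch of the scriptable path this wires up (model name illustrative; the scriptable kwarg is the same one passed from args.torchscript above):
import torch
from timm.models import create_model
model = create_model('ese_vovnet39b', pretrained=False, scriptable=True)
model.eval()
scripted = torch.jit.script(model)  # script the model as validate.py does when args.torchscript is set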