Merge pull request #88 from rwightman/attention

A lot of attention and much more
5 years ago · e0685dd415
parent f098fda2ca f1860ef3a5
commit e0685dd415
53 changed files with 1660 additions and 551 deletions
--- a/.gitignore
+++ b/.gitignore
@ -104,3 +104,5 @@ venv.bak/
 *.tar
 *.pth
 *.gz
+Untitled.ipynb
+Testing notebook.ipynb
--- a/README.md
+++ b/README.md
@ -2,6 +2,20 @@

 ## What's New

+### Feb 18, 2020
+* Big refactor of model layers and addition of several attention mechanisms. Several additions motivated by 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268):
+  * Move layer/module impl into `layers` subfolder/module of `models` and organize in a more granular fashion
+  * ResNet downsample paths now properly support dilation (output stride != 32) for avg_pool ('D' variant) and 3x3 (SENets) networks
+  * Add Selective Kernel Nets on top of ResNet base, pretrained weights
+    * skresnet18 - 73% top-1
+    * skresnet34 - 76.9% top-1 
+    * skresnext50_32x4d (equiv to SKNet50) - 80.2% top-1
+  * ECA and CECA (circular padding) attention layer contributed by [Chris Ha](https://github.com/VRandme)
+  * CBAM attention experiment (not the best results so far, may remove)
+  * Attention factory to allow dynamically selecting one of SE, ECA, CBAM in the `.se` position for all ResNets
+  * Add DropBlock and DropPath (formerly DropConnect for EfficientNet/MobileNetv3) support to all ResNet variants
+* Full dataset results updated to include NoisyStudent weights and 2 of the 3 SK weights
+
 ### Feb 12, 2020
 * Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)

@ -86,6 +100,7 @@ Included models:
    * 'Bag of Tricks' / Gluon C, D, E, S variations (https://arxiv.org/abs/1812.01187)
    * Instagram trained / ImageNet tuned ResNeXt101-32x8d to 32x48d from [facebookresearch](https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/)
    * Res2Net (https://github.com/gasvn/Res2Net, https://arxiv.org/abs/1904.01169)
+    * Selective Kernel (SK) Nets (https://arxiv.org/abs/1903.06586)
 * DLA
    * Original (https://github.com/ucbdrive/dla, https://arxiv.org/abs/1707.06484)
    * Res2Net (https://github.com/gasvn/Res2Net, https://arxiv.org/abs/1904.01169)
@ -138,6 +153,8 @@ Several (less common) features that I often utilize in my projects are included.
 * AutoAugment (https://arxiv.org/abs/1805.09501) and RandAugment (https://arxiv.org/abs/1909.13719) ImageNet configurations modeled after impl for EfficientNet training (https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py)
 * AugMix w/ JSD loss (https://arxiv.org/abs/1912.02781), JSD w/ clean + augmented mixing support works with AutoAugment and RandAugment as well
 * SplitBatchNorm - allows splitting batch norm layers between clean and augmented (auxiliary batch norm) data
+* DropBlock (https://arxiv.org/abs/1810.12890)
+* Efficient Channel Attention - ECA (https://arxiv.org/abs/1910.03151)

 ## Results

@ -150,9 +167,11 @@ I've leveraged the training scripts in this repository to train a few of the mod
 |---|---|---|---|---|---|
 | efficientnet_b3a | 81.874 (18.126) | 95.840 (4.160) | 12.23M | bicubic | 320 (1.0 crop) |
 | efficientnet_b3 | 81.498 (18.502) | 95.718 (4.282) | 12.23M | bicubic | 300 |
+| skresnext50d_32x4d | 81.278 (18.722) | 95.366 (4.634) | 27.5M | bicubic | 288 (1.0 crop) |
 | efficientnet_b2a | 80.608 (19.392) | 95.310 (4.690) | 9.11M | bicubic | 288 (1.0 crop) |
 | mixnet_xl | 80.478 (19.522) | 94.932 (5.068) | 11.90M | bicubic | 224 |
 | efficientnet_b2 | 80.402 (19.598) | 95.076 (4.924) | 9.11M | bicubic | 260 |
+| skresnext50_32x4d | 80.156 (19.844) | 94.642 (5.358) | 27.5M | bicubic | 224 |
 | resnext50d_32x4d | 79.674 (20.326) | 94.868 (5.132) | 25.1M | bicubic | 224 |
 | resnet50 | 79.038 (20.962) | 94.390 (5.610) | 25.6M | bicubic | 224 |
 | mixnet_l | 78.976 (21.024) | 94.184 (5.816) | 7.33M | bicubic | 224 |
@ -165,6 +184,7 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | seresnext26d_32x4d | 77.602 (22.398) | 93.608 (6.392) | 16.8M | bicubic | 224 |
 | mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01M | bicubic | 224 |
 | seresnext26_32x4d | 77.104 (22.896) | 93.316 (6.684) | 16.8M | bicubic | 224 |
+| skresnet34 | 76.912 (23.088) | 93.322 (6.678) | 22.2M | bicubic | 224 |
 | resnet26d | 76.68 (23.32) | 93.166 (6.834) | 16M | bicubic | 224 |
 | mixnet_s | 75.988 (24.012) | 92.794 (7.206) | 4.13M | bicubic | 224 |
 | mobilenetv3_100 | 75.634 (24.366) | 92.708 (7.292) | 5.5M | bicubic | 224 |
@ -175,6 +195,7 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | seresnet34 | 74.808 (25.192) | 92.124 (7.876) | 22M | bilinear | 224 |
 | mnasnet_b1 | 74.658 (25.342) | 92.114 (7.886) | 4.38M | bicubic | 224 |
 | spnasnet_100 | 74.084 (25.916)  | 91.818 (8.182) | 4.42M | bilinear | 224 |
+| skresnet18 | 73.038 (26.962) | 91.168 (8.832) | 11.9M | bicubic | 224 |
 | seresnet18 | 71.742 (28.258) | 90.334 (9.666) | 11.8M | bicubic | 224 |

 ### Ported Weights
--- a/hubconf.py
+++ b/hubconf.py
@ -0,0 +1,10 @@
+dependencies = ['torch']
+
+from timm.models import registry
+
+current_module = __import__(__name__)
+current_module.__dict__.update(registry._model_entrypoints)
+#for fn_name in registry.list_models():
+#    fn = registry.model_entrypoint(fn_name)
+#    setattr(current_module, fn_name, fn)
+
--- a/results/results-imagenet-a.csv
+++ b/results/results-imagenet-a.csv
@ -1,8 +1,14 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
+tf_efficientnet_l2_ns_475,62.3733,37.6267,87.0933,12.9067,480.31,475,0.936,bicubic
+tf_efficientnet_l2_ns,62.0267,37.9733,87.96,12.04,480.31,800,0.96,bicubic
+tf_efficientnet_b7_ns,45.76,54.24,74.2133,25.7867,66.35,600,0.949,bicubic
 ig_resnext101_32x48d,41.56,58.44,66.5467,33.4533,828.41,224,0.875,bilinear
+tf_efficientnet_b6_ns,40.4533,59.5467,68.8667,31.1333,43.04,528,0.942,bicubic
 ig_resnext101_32x32d,39.4267,60.5733,63.7867,36.2133,468.53,224,0.875,bilinear
-ig_resnext101_32x16d,36.0,64.0,59.0,41.0,194.03,224,0.875,bilinear
+tf_efficientnet_b5_ns,39.0133,60.9867,68.08,31.92,30.39,456,0.934,bicubic
+ig_resnext101_32x16d,36,64,59,41,194.03,224,0.875,bilinear
 swsl_resnext101_32x8d,32.0133,67.9867,59.44,40.56,88.79,224,0.875,bilinear
+tf_efficientnet_b4_ns,30.7867,69.2133,59.4667,40.5333,19.34,380,0.922,bicubic
 tf_efficientnet_b8_ap,29.5867,70.4133,56.9333,43.0667,87.41,672,0.954,bicubic
 tf_efficientnet_b8,29.3867,70.6133,57.0533,42.9467,87.41,672,0.954,bicubic
 ig_resnext101_32x8d,28.6667,71.3333,52.32,47.68,88.79,224,0.875,bilinear
@ -13,6 +19,7 @@ tf_efficientnet_b7,25.28,74.72,51.6667,48.3333,66.35,600,0.949,bicubic
 tf_efficientnet_b6_ap,24.3467,75.6533,50.44,49.56,43.04,528,0.942,bicubic
 tf_efficientnet_b6,20.3733,79.6267,45.48,54.52,43.04,528,0.942,bicubic
 tf_efficientnet_b5_ap,19.4667,80.5333,44.7333,55.2667,30.39,456,0.934,bicubic
+tf_efficientnet_b3_ns,19.44,80.56,44.6533,55.3467,12.23,300,0.904,bicubic
 swsl_resnext50_32x4d,18.04,81.96,41.9733,58.0267,25.03,224,0.875,bilinear
 ssl_resnext101_32x16d,17.1867,82.8133,39.9333,60.0667,194.03,224,0.875,bilinear
 tf_efficientnet_b5,17.0533,82.9467,41.92,58.08,30.39,456,0.934,bicubic
@ -23,13 +30,15 @@ tf_efficientnet_b4,13.32,86.68,35.5333,64.4667,19.34,380,0.922,bicubic
 pnasnet5large,13.0533,86.9467,32.2267,67.7733,86.06,331,0.875,bicubic
 nasnetalarge,12.56,87.44,33.4267,66.5733,88.75,331,0.875,bicubic
 ssl_resnext101_32x4d,12.1067,87.8933,31.8933,68.1067,44.18,224,0.875,bilinear
+tf_efficientnet_b2_ns,11.7333,88.2667,32.96,67.04,9.11,260,0.89,bicubic
 gluon_senet154,9.8933,90.1067,26.4267,73.5733,115.09,224,0.875,bicubic
 ssl_resnext50_32x4d,9.6533,90.3467,28.4667,71.5333,25.03,224,0.875,bilinear
 senet154,9.4667,90.5333,26.44,73.56,115.09,224,0.875,bilinear
-efficientnet_b3a,9.2533,90.7467,28.4267,71.5733,12.23,320,1.0,bicubic
+efficientnet_b3a,9.2533,90.7467,28.4267,71.5733,12.23,320,1,bicubic
 efficientnet_b3,8.9733,91.0267,28.2267,71.7733,12.23,300,0.904,bicubic
 inception_v4,8.8933,91.1067,24.68,75.32,42.68,299,0.875,bicubic
 gluon_seresnext101_64x4d,8.8667,91.1333,27.28,72.72,88.23,224,0.875,bicubic
+tf_efficientnet_b1_ns,8.6133,91.3867,27.2933,72.7067,7.79,240,0.882,bicubic
 gluon_xception65,8.44,91.56,25.12,74.88,39.92,299,0.875,bicubic
 gluon_resnet152_v1d,8.36,91.64,23.4267,76.5733,60.21,224,0.875,bicubic
 inception_resnet_v2,8.1733,91.8267,23.5733,76.4267,55.84,299,0.8975,bicubic
@ -39,14 +48,15 @@ tf_efficientnet_b3,8.0133,91.9867,25.48,74.52,12.23,300,0.904,bicubic
 ens_adv_inception_resnet_v2,7.9733,92.0267,23.8667,76.1333,55.84,299,0.8975,bicubic
 gluon_resnet152_v1s,7.8533,92.1467,23.1867,76.8133,60.32,224,0.875,bicubic
 gluon_resnext101_64x4d,7.72,92.28,23.3067,76.6933,83.46,224,0.875,bicubic
+skresnext50_32x4d,7.08,92.92,23.0667,76.9333,27.48,224,0.875,bicubic
 ssl_resnet50,7.04,92.96,23.9067,76.0933,25.56,224,0.875,bilinear
-efficientnet_b2a,6.7467,93.2533,23.5067,76.4933,9.11,288,1.0,bicubic
+efficientnet_b2a,6.7467,93.2533,23.5067,76.4933,9.11,288,1,bicubic
 seresnext101_32x4d,6.4,93.6,21.4933,78.5067,48.96,224,0.875,bilinear
 efficientnet_b2,6.0933,93.9067,21.96,78.04,9.11,260,0.875,bicubic
 gluon_resnext101_32x4d,6.0133,93.9867,21.12,78.88,44.18,224,0.875,bicubic
 gluon_resnet101_v1d,5.92,94.08,19.9467,80.0533,44.57,224,0.875,bicubic
 gluon_seresnext50_32x4d,5.7867,94.2133,21.4533,78.5467,27.56,224,0.875,bicubic
-gluon_inception_v3,5.5067,94.4933,20.0,80.0,23.83,299,0.875,bicubic
+gluon_inception_v3,5.5067,94.4933,20,80,23.83,299,0.875,bicubic
 mixnet_xl,5.4667,94.5333,21.08,78.92,11.9,224,0.875,bicubic
 gluon_resnet101_v1s,5.28,94.72,19.56,80.44,44.67,224,0.875,bicubic
 hrnet_w64,5.16,94.84,19.4933,80.5067,128.06,224,0.875,bilinear
@ -69,8 +79,9 @@ inception_v3,4.1867,95.8133,16.2933,83.7067,27.16,299,0.875,bicubic
 tf_efficientnet_b2_ap,4.16,95.84,18.3467,81.6533,9.11,260,0.89,bicubic
 seresnet152,4.1467,95.8533,15.9333,84.0667,66.82,224,0.875,bilinear
 resnext101_32x8d,4.1333,95.8667,16.92,83.08,88.79,224,0.875,bilinear
+tf_efficientnet_b0_ns,4.1333,95.8667,17.68,82.32,5.29,224,0.875,bicubic
 dpn98,4.08,95.92,15.96,84.04,61.57,224,0.875,bicubic
-res2net101_26w_4s,4.0,96.0,14.8667,85.1333,45.21,224,0.875,bilinear
+res2net101_26w_4s,4,96,14.8667,85.1333,45.21,224,0.875,bilinear
 efficientnet_b1,3.9733,96.0267,15.7733,84.2267,7.79,240,0.875,bicubic
 tf_efficientnet_b2,3.76,96.24,16.5867,83.4133,9.11,260,0.89,bicubic
 hrnet_w30,3.68,96.32,15.5733,84.4267,37.71,224,0.875,bilinear
@ -102,6 +113,7 @@ dla60_res2net,2.64,97.36,14.1733,85.8267,21.15,224,0.875,bilinear
 gluon_resnet101_v1b,2.6133,97.3867,13.56,86.44,44.55,224,0.875,bicubic
 dla60x,2.6,97.4,13.3467,86.6533,17.65,224,0.875,bilinear
 mixnet_m,2.5467,97.4533,12.4133,87.5867,5.01,224,0.875,bicubic
+efficientnet_es,2.3733,97.6267,13.8267,86.1733,5.44,224,0.875,bicubic
 resnet152,2.36,97.64,12.2,87.8,60.19,224,0.875,bilinear
 swsl_resnet18,2.3467,97.6533,11.2267,88.7733,11.69,224,0.875,bilinear
 wide_resnet50_2,2.32,97.68,11.8267,88.1733,68.88,224,0.875,bilinear
@ -133,7 +145,7 @@ ssl_resnet18,1.3867,98.6133,8.2,91.8,11.69,224,0.875,bilinear
 dla60,1.3333,98.6667,9.4667,90.5333,22.33,224,0.875,bilinear
 dpn68,1.32,98.68,8.8267,91.1733,12.61,224,0.875,bicubic
 res2net50_48w_2s,1.2933,98.7067,8.9333,91.0667,25.29,224,0.875,bilinear
-tf_mixnet_s,1.2667,98.7333,8.7467,91.2533,4.13,224,0.875,bicubic
+tf_mixnet_s,1.2667,98.7333,8.7333,91.2667,4.13,224,0.875,bicubic
 fbnetc_100,1.24,98.76,8.76,91.24,5.57,224,0.875,bilinear
 resnet26d,1.24,98.76,9.32,90.68,16.01,224,0.875,bicubic
 tf_mobilenetv3_large_100,1.1867,98.8133,7.9467,92.0533,5.48,224,0.875,bilinear
@ -143,9 +155,10 @@ seresnet34,1.12,98.88,7.4267,92.5733,21.96,224,0.875,bilinear
 tf_efficientnet_es,1.12,98.88,8.5867,91.4133,5.44,224,0.875,bicubic
 spnasnet_100,1.1067,98.8933,8.2133,91.7867,4.42,224,0.875,bilinear
 dla34,1.08,98.92,7.68,92.32,15.78,224,0.875,bilinear
-resnet34,1.0,99.0,7.5333,92.4667,21.8,224,0.875,bilinear
+resnet34,1,99,7.5333,92.4667,21.8,224,0.875,bilinear
 gluon_resnet34_v1b,0.8933,99.1067,6.6,93.4,21.8,224,0.875,bicubic
 hrnet_w18_small_v2,0.8933,99.1067,7.3867,92.6133,15.6,224,0.875,bilinear
+skresnet18,0.88,99.12,7.3467,92.6533,11.96,224,0.875,bicubic
 tf_mobilenetv3_large_075,0.88,99.12,6.72,93.28,3.99,224,0.875,bilinear
 mnasnet_100,0.8667,99.1333,7.8267,92.1733,4.38,224,0.875,bicubic
 tf_mobilenetv3_small_100,0.7467,99.2533,4.6667,95.3333,2.54,224,0.875,bilinear
@ -153,7 +166,7 @@ seresnet18,0.7333,99.2667,6.0267,93.9733,11.78,224,0.875,bicubic
 densenet121,0.68,99.32,6.8933,93.1067,7.98,224,0.875,bicubic
 tf_mobilenetv3_small_075,0.6533,99.3467,4.1867,95.8133,2.04,224,0.875,bilinear
 tv_resnet34,0.6,99.4,5.5333,94.4667,21.8,224,0.875,bilinear
-resnet26,0.5867,99.4133,6.8933,93.1067,16.0,224,0.875,bicubic
+resnet26,0.5867,99.4133,6.8933,93.1067,16,224,0.875,bicubic
 dla46_c,0.52,99.48,4.1733,95.8267,1.31,224,0.875,bilinear
 dla60x_c,0.48,99.52,5.2133,94.7867,1.34,224,0.875,bilinear
 tf_mobilenetv3_large_minimal_100,0.48,99.52,4.88,95.12,3.92,224,0.875,bilinear
@ -162,4 +175,4 @@ dla46x_c,0.4133,99.5867,4.44,95.56,1.08,224,0.875,bilinear
 gluon_resnet18_v1b,0.3867,99.6133,4.7867,95.2133,11.69,224,0.875,bicubic
 tf_mobilenetv3_small_minimal_100,0.36,99.64,2.8667,97.1333,2.04,224,0.875,bilinear
 resnet18,0.2933,99.7067,4.04,95.96,11.69,224,0.875,bilinear
-tv_resnet50,0.0,100.0,2.9067,97.0933,25.56,224,0.875,bilinear
+tv_resnet50,0,100,2.9067,97.0933,25.56,224,0.875,bilinear
--- a/results/results-imagenet.csv
+++ b/results/results-imagenet.csv
@ -1,7 +1,13 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
+tf_efficientnet_l2_ns,88.352,11.648,98.65,1.35,480.31,800,0.961,bicubic
+tf_efficientnet_l2_ns_475,88.234,11.766,98.546,1.454,480.31,475,0.936,bicubic
+tf_efficientnet_b7_ns,86.84,13.16,98.094,1.906,66.35,600,0.949,bicubic
+tf_efficientnet_b6_ns,86.452,13.548,97.882,2.118,43.04,528,0.942,bicubic
+tf_efficientnet_b5_ns,86.088,13.912,97.752,2.248,30.39,456,0.934,bicubic
 ig_resnext101_32x48d,85.428,14.572,97.572,2.428,828.41,224,0.875,bilinear
 tf_efficientnet_b8,85.37,14.63,97.39,2.61,87.41,672,0.954,bicubic
 tf_efficientnet_b8_ap,85.37,14.63,97.294,2.706,87.41,672,0.954,bicubic
+tf_efficientnet_b4_ns,85.162,14.838,97.47,2.53,19.34,380,0.922,bicubic
 tf_efficientnet_b7_ap,85.12,14.88,97.252,2.748,66.35,600,0.949,bicubic
 ig_resnext101_32x32d,85.094,14.906,97.438,2.562,468.53,224,0.875,bilinear
 tf_efficientnet_b7,84.936,15.064,97.204,2.796,66.35,600,0.949,bicubic
@ -10,6 +16,7 @@ swsl_resnext101_32x8d,84.284,15.716,97.176,2.824,88.79,224,0.875,bilinear
 tf_efficientnet_b5_ap,84.252,15.748,96.974,3.026,30.39,456,0.934,bicubic
 ig_resnext101_32x16d,84.17,15.83,97.196,2.804,194.03,224,0.875,bilinear
 tf_efficientnet_b6,84.11,15.89,96.886,3.114,43.04,528,0.942,bicubic
+tf_efficientnet_b3_ns,84.048,15.952,96.91,3.09,12.23,300,0.904,bicubic
 tf_efficientnet_b5,83.812,16.188,96.748,3.252,30.39,456,0.934,bicubic
 swsl_resnext101_32x16d,83.346,16.654,96.846,3.154,194.03,224,0.875,bilinear
 tf_efficientnet_b4_ap,83.248,16.752,96.392,3.608,19.34,380,0.922,bicubic
@ -18,6 +25,7 @@ tf_efficientnet_b4,83.022,16.978,96.3,3.7,19.34,380,0.922,bicubic
 pnasnet5large,82.736,17.264,96.046,3.954,86.06,331,0.875,bicubic
 ig_resnext101_32x8d,82.688,17.312,96.636,3.364,88.79,224,0.875,bilinear
 nasnetalarge,82.554,17.446,96.038,3.962,88.75,331,0.875,bicubic
+tf_efficientnet_b2_ns,82.38,17.62,96.248,3.752,9.11,260,0.89,bicubic
 swsl_resnext50_32x4d,82.182,17.818,96.23,3.77,25.03,224,0.875,bilinear
 efficientnet_b3a,81.866,18.134,95.836,4.164,12.23,320,1,bicubic
 ssl_resnext101_32x16d,81.844,18.156,96.096,3.904,194.03,224,0.875,bilinear
@ -25,6 +33,7 @@ tf_efficientnet_b3_ap,81.822,18.178,95.624,4.376,12.23,300,0.904,bicubic
 tf_efficientnet_b3,81.636,18.364,95.718,4.282,12.23,300,0.904,bicubic
 ssl_resnext101_32x8d,81.616,18.384,96.038,3.962,88.79,224,0.875,bilinear
 efficientnet_b3,81.494,18.506,95.716,4.284,12.23,300,0.904,bicubic
+tf_efficientnet_b1_ns,81.388,18.612,95.738,4.262,7.79,240,0.882,bicubic
 senet154,81.31,18.69,95.496,4.504,115.09,224,0.875,bilinear
 gluon_senet154,81.234,18.766,95.348,4.652,115.09,224,0.875,bicubic
 swsl_resnet50,81.166,18.834,95.972,4.028,25.56,224,0.875,bilinear
@ -47,9 +56,10 @@ tf_efficientnet_b2_ap,80.3,19.7,95.028,4.972,9.11,260,0.89,bicubic
 seresnext101_32x4d,80.228,19.772,95.018,4.982,48.96,224,0.875,bilinear
 inception_v4,80.168,19.832,94.968,5.032,42.68,299,0.875,bicubic
 dpn107,80.156,19.844,94.91,5.09,86.92,224,0.875,bicubic
+skresnext50_32x4d,80.156,19.844,94.642,5.358,27.48,224,0.875,bicubic
 tf_efficientnet_b2,80.086,19.914,94.908,5.092,9.11,260,0.89,bicubic
 dpn92,80.008,19.992,94.836,5.164,37.67,224,0.875,bicubic
-ens_adv_inception_resnet_v2,79.982,20.018,94.938,5.062,55.84,299,0.8975,bicubic
+ens_adv_inception_resnet_v2,79.982,20.018,94.936,5.064,55.84,299,0.8975,bicubic
 gluon_seresnext50_32x4d,79.918,20.082,94.822,5.178,27.56,224,0.875,bicubic
 gluon_resnet152_v1c,79.91,20.09,94.84,5.16,60.21,224,0.875,bicubic
 dpn131,79.822,20.178,94.71,5.29,79.25,224,0.875,bicubic
@ -85,6 +95,7 @@ tf_efficientnet_em,78.708,21.292,94.314,5.686,6.9,240,0.882,bicubic
 efficientnet_b1,78.698,21.302,94.144,5.856,7.79,240,0.875,bicubic
 dla169,78.688,21.312,94.336,5.664,53.99,224,0.875,bilinear
 seresnet152,78.66,21.34,94.37,5.63,66.82,224,0.875,bilinear
+tf_efficientnet_b0_ns,78.658,21.342,94.376,5.624,5.29,224,0.875,bicubic
 res2net50_26w_6s,78.57,21.43,94.124,5.876,37.05,224,0.875,bilinear
 resnext50_32x4d,78.512,21.488,94.042,5.958,25.03,224,0.875,bicubic
 dla102x,78.51,21.49,94.228,5.772,26.77,224,0.875,bilinear
@ -99,6 +110,7 @@ dla60x,78.246,21.754,94.018,5.982,17.65,224,0.875,bilinear
 res2next50,78.246,21.754,93.892,6.108,24.67,224,0.875,bilinear
 hrnet_w30,78.206,21.794,94.222,5.778,37.71,224,0.875,bilinear
 res2net50_14w_8s,78.15,21.85,93.848,6.152,25.06,224,0.875,bilinear
+efficientnet_es,78.066,21.934,93.926,6.074,5.44,224,0.875,bicubic
 dla102,78.032,21.968,93.946,6.054,33.73,224,0.875,bilinear
 gluon_resnet50_v1c,78.012,21.988,93.988,6.012,25.58,224,0.875,bicubic
 seresnext26t_32x4d,77.998,22.002,93.708,6.292,16.82,224,0.875,bicubic
@ -115,7 +127,7 @@ adv_inception_v3,77.582,22.418,93.736,6.264,23.83,299,0.875,bicubic
 gluon_resnet50_v1b,77.58,22.42,93.716,6.284,25.56,224,0.875,bicubic
 res2net50_48w_2s,77.522,22.478,93.554,6.446,25.29,224,0.875,bilinear
 dpn68b,77.512,22.488,93.822,6.178,12.61,224,0.875,bicubic
-inception_v3,77.438,22.562,93.474,6.526,27.16,299,0.875,bicubic
+inception_v3,77.44,22.56,93.474,6.526,27.16,299,0.875,bicubic
 resnet101,77.374,22.626,93.54,6.46,44.55,224,0.875,bilinear
 densenet161,77.358,22.642,93.638,6.362,28.68,224,0.875,bicubic
 tf_efficientnet_cc_b0_4e,77.306,22.694,93.334,6.666,13.31,224,0.875,bicubic
@ -151,11 +163,12 @@ spnasnet_100,74.084,25.916,91.818,8.182,4.42,224,0.875,bilinear
 tf_mobilenetv3_large_075,73.438,26.562,91.35,8.65,3.99,224,0.875,bilinear
 tv_resnet34,73.312,26.688,91.426,8.574,21.8,224,0.875,bilinear
 swsl_resnet18,73.276,26.724,91.734,8.266,11.69,224,0.875,bilinear
+skresnet18,73.038,26.962,91.168,8.832,11.96,224,0.875,bicubic
 ssl_resnet18,72.61,27.39,91.416,8.584,11.69,224,0.875,bilinear
 hrnet_w18_small,72.342,27.658,90.678,9.322,13.19,224,0.875,bilinear
 tf_mobilenetv3_large_minimal_100,72.248,27.752,90.63,9.37,3.92,224,0.875,bilinear
 seresnet18,71.742,28.258,90.334,9.666,11.78,224,0.875,bicubic
-gluon_resnet18_v1b,70.836,29.164,89.76,10.24,11.69,224,0.875,bicubic
+gluon_resnet18_v1b,70.836,29.164,89.762,10.238,11.69,224,0.875,bicubic
 resnet18,69.748,30.252,89.078,10.922,11.69,224,0.875,bilinear
 tf_mobilenetv3_small_100,67.922,32.078,87.664,12.336,2.54,224,0.875,bilinear
 dla60x_c,67.892,32.108,88.426,11.574,1.34,224,0.875,bilinear
--- a/results/results-imagenetv2-matched-frequency.csv
+++ b/results/results-imagenetv2-matched-frequency.csv
@ -1,9 +1,15 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
+tf_efficientnet_l2_ns_475,80.46,19.54,95.73,4.27,480.31,475,0.936,bicubic
+tf_efficientnet_l2_ns,80.25,19.75,95.84,4.16,480.31,800,0.96,bicubic
+tf_efficientnet_b7_ns,78.51,21.49,94.38,5.62,66.35,600,0.949,bicubic
+tf_efficientnet_b6_ns,77.28,22.72,93.89,6.11,43.04,528,0.942,bicubic
 ig_resnext101_32x48d,76.87,23.13,93.31,6.69,828.41,224,0.875,bilinear
 ig_resnext101_32x32d,76.84,23.16,93.2,6.8,468.53,224,0.875,bilinear
+tf_efficientnet_b5_ns,76.81,23.19,93.58,6.42,30.39,456,0.934,bicubic
 tf_efficientnet_b7_ap,76.09,23.91,92.97,7.03,66.35,600,0.949,bicubic
 tf_efficientnet_b8_ap,76.09,23.91,92.73,7.27,87.41,672,0.954,bicubic
 ig_resnext101_32x16d,75.72,24.28,92.91,7.09,194.03,224,0.875,bilinear
+tf_efficientnet_b4_ns,75.67,24.33,93.05,6.95,19.34,380,0.922,bicubic
 swsl_resnext101_32x8d,75.43,24.57,92.76,7.24,88.79,224,0.875,bilinear
 tf_efficientnet_b6_ap,75.38,24.62,92.44,7.56,43.04,528,0.942,bicubic
 tf_efficientnet_b8,74.94,25.06,92.31,7.69,87.41,672,0.954,bicubic
@ -12,6 +18,7 @@ tf_efficientnet_b5_ap,74.6,25.4,91.99,8.01,30.39,456,0.934,bicubic
 swsl_resnext101_32x4d,74.14,25.86,91.99,8.01,44.18,224,0.875,bilinear
 swsl_resnext101_32x16d,74.02,25.98,92.16,7.84,194.03,224,0.875,bilinear
 tf_efficientnet_b6,73.9,26.1,91.75,8.25,43.04,528,0.942,bicubic
+tf_efficientnet_b3_ns,73.89,26.11,91.87,8.13,12.23,300,0.904,bicubic
 ig_resnext101_32x8d,73.65,26.35,92.19,7.81,88.79,224,0.875,bilinear
 tf_efficientnet_b5,73.55,26.45,91.46,8.54,30.39,456,0.934,bicubic
 tf_efficientnet_b4_ap,72.89,27.11,90.98,9.02,19.34,380,0.922,bicubic
@ -19,11 +26,13 @@ swsl_resnext50_32x4d,72.56,27.44,90.87,9.13,25.03,224,0.875,bilinear
 pnasnet5large,72.38,27.62,90.24,9.76,86.06,331,0.875,bicubic
 nasnetalarge,72.32,27.68,90.53,9.47,88.75,331,0.875,bicubic
 tf_efficientnet_b4,72.29,27.71,90.59,9.41,19.34,380,0.922,bicubic
+tf_efficientnet_b2_ns,72.28,27.72,91.09,8.91,9.11,260,0.89,bicubic
 swsl_resnet50,71.7,28.3,90.5,9.5,25.56,224,0.875,bilinear
 ssl_resnext101_32x8d,71.5,28.5,90.46,9.54,88.79,224,0.875,bilinear
 ssl_resnext101_32x16d,71.41,28.59,90.56,9.44,194.03,224,0.875,bilinear
 tf_efficientnet_b3_ap,70.92,29.08,89.43,10.57,12.23,300,0.904,bicubic
 efficientnet_b3a,70.87,29.13,89.72,10.28,12.23,320,1.0,bicubic
+tf_efficientnet_b1_ns,70.87,29.13,90.12,9.88,7.79,240,0.882,bicubic
 efficientnet_b3,70.76,29.24,89.85,10.15,12.23,300,0.904,bicubic
 tf_efficientnet_b3,70.64,29.36,89.44,10.56,12.23,300,0.904,bicubic
 gluon_senet154,70.6,29.4,88.92,11.08,115.09,224,0.875,bicubic
@ -31,7 +40,7 @@ ssl_resnext101_32x4d,70.53,29.47,89.76,10.24,44.18,224,0.875,bilinear
 senet154,70.5,29.5,89.01,10.99,115.09,224,0.875,bilinear
 gluon_seresnext101_64x4d,70.43,29.57,89.35,10.65,88.23,224,0.875,bicubic
 gluon_resnet152_v1s,70.29,29.71,88.85,11.15,60.32,224,0.875,bicubic
-inception_resnet_v2,70.12,29.88,88.69,11.31,55.84,299,0.8975,bicubic
+inception_resnet_v2,70.12,29.88,88.7,11.3,55.84,299,0.8975,bicubic
 gluon_seresnext101_32x4d,70.01,29.99,88.9,11.1,48.96,224,0.875,bicubic
 gluon_resnet152_v1d,69.96,30.04,88.49,11.51,60.21,224,0.875,bicubic
 ssl_resnext50_32x4d,69.71,30.29,89.44,10.56,25.03,224,0.875,bilinear
@ -57,6 +66,7 @@ gluon_seresnext50_32x4d,68.67,31.33,88.31,11.69,27.56,224,0.875,bicubic
 hrnet_w64,68.64,31.36,88.05,11.95,128.06,224,0.875,bilinear
 dpn98,68.59,31.41,87.68,12.32,61.57,224,0.875,bicubic
 ssl_resnet50,68.41,31.59,88.56,11.44,25.56,224,0.875,bilinear
+skresnext50_32x4d,68.35,31.65,87.57,12.43,27.48,224,0.875,bicubic
 dla102x2,68.33,31.67,87.89,12.11,41.75,224,0.875,bilinear
 gluon_resnext50_32x4d,68.31,31.69,87.3,12.7,25.03,224,0.875,bicubic
 tf_efficientnet_el,68.18,31.82,88.35,11.65,10.59,300,0.904,bicubic
@ -66,6 +76,7 @@ resnext101_32x8d,67.86,32.14,87.49,12.51,88.79,224,0.875,bilinear
 seresnext50_32x4d,67.84,32.16,87.62,12.38,27.56,224,0.875,bilinear
 hrnet_w48,67.77,32.23,87.42,12.58,77.47,224,0.875,bilinear
 hrnet_w44,67.74,32.26,87.56,12.44,67.06,224,0.875,bilinear
+tf_efficientnet_b0_ns,67.71,32.29,88.07,11.93,5.29,224,0.875,bicubic
 xception,67.65,32.35,87.57,12.43,22.86,299,0.8975,bicubic
 dla169,67.61,32.39,87.59,12.41,53.99,224,0.875,bilinear
 gluon_inception_v3,67.59,32.41,87.47,12.53,23.83,299,0.875,bicubic
@ -87,6 +98,7 @@ dla60_res2net,67.02,32.98,87.16,12.84,21.15,224,0.875,bilinear
 dla102x,67.01,32.99,86.77,13.23,26.77,224,0.875,bilinear
 mixnet_l,66.94,33.06,86.91,13.09,7.33,224,0.875,bicubic
 res2net50_26w_6s,66.91,33.09,86.86,13.14,37.05,224,0.875,bilinear
+efficientnet_es,66.88,33.12,86.73,13.27,5.44,224,0.875,bicubic
 tf_efficientnet_b1,66.88,33.12,87.01,12.99,7.79,240,0.882,bicubic
 tf_efficientnet_em,66.88,33.12,86.97,13.03,6.9,240,0.882,bicubic
 resnext50_32x4d,66.87,33.13,86.34,13.66,25.03,224,0.875,bicubic
@ -95,7 +107,7 @@ tf_mixnet_l,66.78,33.22,86.47,13.53,7.33,224,0.875,bicubic
 selecsls60b,66.76,33.24,86.53,13.47,32.77,224,0.875,bicubic
 hrnet_w32,66.75,33.25,87.3,12.7,41.23,224,0.875,bilinear
 wide_resnet101_2,66.73,33.27,87.03,12.97,126.89,224,0.875,bilinear
-adv_inception_v3,66.65,33.35,86.53,13.47,23.83,299,0.875,bicubic
+adv_inception_v3,66.65,33.35,86.54,13.46,23.83,299,0.875,bicubic
 wide_resnet50_2,66.65,33.35,86.8,13.2,68.88,224,0.875,bilinear
 dla60_res2next,66.64,33.36,87.03,12.97,17.33,224,0.875,bilinear
 gluon_resnet50_v1c,66.56,33.44,86.18,13.82,25.58,224,0.875,bicubic
@ -106,7 +118,7 @@ seresnet50,66.25,33.75,86.33,13.67,28.09,224,0.875,bilinear
 selecsls60,66.21,33.79,86.34,13.66,30.67,224,0.875,bicubic
 tv_resnext50_32x4d,66.18,33.82,86.04,13.96,25.03,224,0.875,bilinear
 tf_efficientnet_cc_b0_8e,66.17,33.83,86.24,13.76,24.01,224,0.875,bicubic
-inception_v3,66.15,33.85,86.33,13.67,27.16,299,0.875,bicubic
+inception_v3,66.16,33.84,86.32,13.68,27.16,299,0.875,bicubic
 res2net50_26w_4s,66.14,33.86,86.6,13.4,25.7,224,0.875,bilinear
 gluon_resnet50_v1b,66.07,33.93,86.26,13.74,25.56,224,0.875,bicubic
 res2net50_14w_8s,66.02,33.98,86.25,13.75,25.06,224,0.875,bilinear
@ -151,6 +163,7 @@ mnasnet_100,61.9,38.1,83.71,16.29,4.38,224,0.875,bicubic
 ssl_resnet18,61.48,38.52,83.3,16.7,11.69,224,0.875,bilinear
 spnasnet_100,61.22,38.78,82.79,17.21,4.42,224,0.875,bilinear
 tv_resnet34,61.19,38.81,82.71,17.29,21.8,224,0.875,bilinear
+skresnet18,60.86,39.14,82.88,17.12,11.96,224,0.875,bicubic
 tf_mobilenetv3_large_075,60.4,39.6,81.95,18.05,3.99,224,0.875,bilinear
 seresnet18,59.8,40.2,81.69,18.31,11.78,224,0.875,bicubic
 tf_mobilenetv3_large_minimal_100,59.07,40.93,81.15,18.85,3.92,224,0.875,bilinear
--- a/results/results-sketch.csv
+++ b/results/results-sketch.csv
@ -6,19 +6,28 @@ swsl_resnext101_32x16d,57.4584,42.5416,80.3848,19.6152,194.03,224,0.875,bilinear
 swsl_resnext101_32x8d,56.4385,43.5615,78.9444,21.0556,88.79,224,0.875,bilinear
 ig_resnext101_32x8d,54.9176,45.0824,77.5335,22.4665,88.79,224,0.875,bilinear
 swsl_resnext101_32x4d,53.6029,46.3971,76.3466,23.6534,44.18,224,0.875,bilinear
+tf_efficientnet_l2_ns_475,51.4944,48.5056,73.9276,26.0724,480.31,475,0.936,bicubic
 swsl_resnext50_32x4d,50.4372,49.5628,73.3675,26.6325,25.03,224,0.875,bilinear
 swsl_resnet50,49.5412,50.4588,72.3339,27.6661,25.56,224,0.875,bilinear
+tf_efficientnet_b7_ns,47.8001,52.1999,69.6398,30.3602,66.35,600,0.949,bicubic
+tf_efficientnet_b6_ns,47.7608,52.2392,69.968,30.032,43.04,528,0.942,bicubic
+tf_efficientnet_l2_ns,47.6311,52.3689,70.0033,29.9967,480.31,800,0.961,bicubic
 tf_efficientnet_b8_ap,45.7741,54.2259,67.9106,32.0894,87.41,672,0.954,bicubic
-tf_efficientnet_b8,42.508,57.492,64.857,35.143,87.41,672,0.954,bicubic
+tf_efficientnet_b5_ns,45.615,54.385,67.8418,32.1582,30.39,456,0.934,bicubic
+tf_efficientnet_b4_ns,43.4495,56.5505,65.5191,34.4809,19.34,380,0.922,bicubic
+tf_efficientnet_b8,42.5082,57.4918,64.8568,35.1432,87.41,672,0.954,bicubic
 tf_efficientnet_b7,41.4314,58.5686,63.0175,36.9825,66.35,600,0.949,bicubic
 tf_efficientnet_b7_ap,41.4294,58.5706,62.8741,37.1259,66.35,600,0.949,bicubic
 tf_efficientnet_b5_ap,41.4176,58.5824,62.0841,37.9159,30.39,456,0.934,bicubic
 tf_efficientnet_b6_ap,41.0993,58.9007,62.3553,37.6447,43.04,528,0.942,bicubic
 tf_efficientnet_b4_ap,40.4842,59.5158,61.7226,38.2774,19.34,380,0.922,bicubic
+tf_efficientnet_b3_ns,39.5842,60.4158,61.4534,38.5466,12.23,300,0.904,bicubic
 tf_efficientnet_b5,38.356,61.644,59.9128,40.0872,30.39,456,0.934,bicubic
 tf_efficientnet_b3_ap,37.0552,62.9448,57.2403,42.7597,12.23,300,0.904,bicubic
+tf_efficientnet_b2_ns,36.1827,63.8173,57.5507,42.4493,9.11,260,0.89,bicubic
 swsl_resnet18,35.8584,64.1416,58.4547,41.5453,11.69,224,0.875,bilinear
-ssl_resnext101_32x16d,34.6028,65.3972,55.9315,44.0685,194.03,224,0.875,bilinear
+ssl_resnext101_32x16d,34.6047,65.3953,55.9315,44.0685,194.03,224,0.875,bilinear
+tf_efficientnet_b1_ns,34.1567,65.8433,55.4894,44.5106,7.79,240,0.882,bicubic
 tf_efficientnet_b4,34.0643,65.9357,54.1984,45.8016,19.34,380,0.922,bicubic
 ssl_resnext101_32x8d,34.0172,65.9828,55.6014,44.3986,88.79,224,0.875,bilinear
 tf_efficientnet_b6,33.9975,66.0025,54.5442,45.4558,43.04,528,0.942,bicubic
@ -28,7 +37,7 @@ gluon_resnet152_v1d,32.734,67.266,51.0877,48.9123,60.21,224,0.875,bicubic
 tf_efficientnet_b2_ap,32.6809,67.3191,52.2392,47.7608,9.11,260,0.89,bicubic
 nasnetalarge,32.5964,67.4036,49.7789,50.2211,88.75,331,0.875,bicubic
 pnasnet5large,32.5296,67.4704,50.1916,49.8084,86.06,331,0.875,bicubic
-ens_adv_inception_resnet_v2,32.3724,67.6276,50.4274,49.5726,55.84,299,0.8975,bicubic
+ens_adv_inception_resnet_v2,32.3705,67.6295,50.4274,49.5726,55.84,299,0.8975,bicubic
 gluon_resnet152_v1s,32.3312,67.6688,50.5257,49.4743,60.32,224,0.875,bicubic
 gluon_seresnext101_64x4d,32.2054,67.7946,50.3193,49.6807,88.23,224,0.875,bicubic
 gluon_seresnext101_32x4d,32.1071,67.8929,51.237,48.763,48.96,224,0.875,bicubic
@ -56,23 +65,25 @@ senet154,30.0006,69.9994,48.034,51.966,115.09,224,0.875,bilinear
 dpn92,29.9534,70.0466,49.1619,50.8381,37.67,224,0.875,bicubic
 gluon_senet154,29.8768,70.1232,47.8944,52.1056,115.09,224,0.875,bicubic
 xception,29.865,70.135,48.6864,51.3136,22.86,299,0.8975,bicubic
-adv_inception_v3,29.8178,70.1822,47.8473,52.1527,23.83,299,0.875,bicubic
+adv_inception_v3,29.8159,70.1841,47.8473,52.1527,23.83,299,0.875,bicubic
 efficientnet_b2,29.6154,70.3846,48.7767,51.2233,9.11,260,0.875,bicubic
 gluon_xception65,29.5506,70.4494,47.5054,52.4946,39.92,299,0.875,bicubic
 resnext101_32x8d,29.4386,70.5614,48.4859,51.5141,88.79,224,0.875,bilinear
 ssl_resnet50,29.4229,70.5771,49.7809,50.2191,25.56,224,0.875,bilinear
-gluon_inception_v3,29.1242,70.8758,46.9591,53.0409,23.83,299,0.875,bicubic
+gluon_inception_v3,29.1242,70.8758,46.9571,53.0429,23.83,299,0.875,bicubic
 hrnet_w64,28.9886,71.0114,47.1418,52.8582,128.06,224,0.875,bilinear
+tf_efficientnet_b0_ns,28.9021,71.0979,49.0106,50.9894,5.29,224,0.875,bicubic
 tf_efficientnet_b1,28.8864,71.1136,47.5034,52.4966,7.79,240,0.882,bicubic
 gluon_resnet101_v1b,28.8785,71.1215,46.3892,53.6108,44.55,224,0.875,bicubic
+skresnext50_32x4d,28.8176,71.1824,46.4973,53.5027,27.48,224,0.875,bicubic
 gluon_seresnext50_32x4d,28.6506,71.3494,46.4364,53.5636,27.56,224,0.875,bicubic
 hrnet_w40,28.6408,71.3592,47.4543,52.5457,57.56,224,0.875,bilinear
 resnet152,28.5327,71.4673,47.1182,52.8818,60.19,224,0.875,bilinear
 hrnet_w48,28.4128,71.5872,47.5859,52.4141,77.47,224,0.875,bilinear
 gluon_resnext50_32x4d,28.3755,71.6245,45.3281,54.6719,25.03,224,0.875,bicubic
 tf_efficientnet_b0_ap,28.346,71.654,47.5309,52.4691,5.29,224,0.875,bicubic
+dla102x2,28.3146,71.6854,46.7606,53.2394,41.75,224,0.875,bilinear
 tf_efficientnet_cc_b0_4e,28.3146,71.6854,47.3639,52.6361,13.31,224,0.875,bicubic
-dla102x2,28.3126,71.6874,46.7606,53.2394,41.75,224,0.875,bilinear
 dla169,28.3126,71.6874,47.3914,52.6086,53.99,224,0.875,bilinear
 mixnet_xl,28.2871,71.7129,46.7016,53.2984,11.9,224,0.875,bicubic
 gluon_resnet50_v1d,28.2458,71.7542,45.8783,54.1217,25.58,224,0.875,bicubic
@ -80,22 +91,23 @@ wide_resnet101_2,28.1082,71.8918,46.401,53.599,126.89,224,0.875,bilinear
 gluon_resnet101_v1c,28.1043,71.8957,45.9608,54.0392,44.57,224,0.875,bicubic
 densenet161,28.0807,71.9193,46.6407,53.3593,28.68,224,0.875,bicubic
 dpn68b,27.8842,72.1158,47.468,52.532,12.61,224,0.875,bicubic
-tf_inception_v3,27.7801,72.2199,45.7211,54.2789,23.83,299,0.875,bicubic
+tf_inception_v3,27.782,72.218,45.7191,54.2809,23.83,299,0.875,bicubic
 res2net101_26w_4s,27.7683,72.2317,45.1787,54.8213,45.21,224,0.875,bilinear
 hrnet_w44,27.6209,72.3791,45.837,54.163,67.06,224,0.875,bilinear
-inception_v3,27.5561,72.4439,45.2652,54.7348,27.16,299,0.875,bicubic
+inception_v3,27.5561,72.4439,45.2632,54.7368,27.16,299,0.875,bicubic
 hrnet_w30,27.3812,72.6188,46.5543,53.4457,37.71,224,0.875,bilinear
 hrnet_w32,27.3694,72.6306,45.9942,54.0058,41.23,224,0.875,bilinear
 gluon_resnet50_v1s,27.3261,72.6739,45.222,54.778,25.68,224,0.875,bicubic
 densenet201,27.2652,72.7348,46.2222,53.7778,20.01,224,0.875,bicubic
 res2net50_26w_8s,27.0785,72.9215,44.4281,55.5719,48.4,224,0.875,bilinear
-dla102x,27.0609,72.9391,45.4754,54.5246,26.77,224,0.875,bilinear
+dla102x,27.0609,72.9391,45.4735,54.5265,26.77,224,0.875,bilinear
 resnet101,26.9626,73.0374,45.2337,54.7663,44.55,224,0.875,bilinear
 resnext50d_32x4d,26.8761,73.1239,44.4359,55.5641,25.05,224,0.875,bicubic
 densenet169,26.829,73.171,45.3733,54.6267,14.15,224,0.875,bicubic
 seresnext101_32x4d,26.8113,73.1887,43.4966,56.5034,48.96,224,0.875,bilinear
 seresnet152,26.6757,73.3243,43.9466,56.0534,66.82,224,0.875,bilinear
 tf_efficientnet_el,26.6226,73.3774,44.6482,55.3518,10.59,300,0.904,bicubic
+efficientnet_es,26.6207,73.3793,45.1119,54.8881,5.44,224,0.875,bicubic
 res2net50_26w_6s,26.5951,73.4049,43.9899,56.0101,37.05,224,0.875,bilinear
 dla60x,26.5519,73.4481,45.0235,54.9765,17.65,224,0.875,bilinear
 tf_efficientnet_b0,26.4851,73.5149,45.6464,54.3536,5.29,224,0.875,bicubic
@ -125,12 +137,13 @@ efficientnet_b0,25.0152,74.9848,42.7872,57.2128,5.29,224,0.875,bicubic
 gluon_resnet34_v1b,24.9386,75.0614,42.2429,57.7571,21.8,224,0.875,bicubic
 dla60,24.9327,75.0673,43.2962,56.7038,22.33,224,0.875,bilinear
 tf_efficientnet_em,24.5416,75.4584,42.4119,57.5881,6.9,240,0.882,bicubic
+skresnet18,24.4827,75.5173,42.5357,57.4643,11.96,224,0.875,bicubic
 tv_resnet50,24.07,75.93,41.3134,58.6866,25.56,224,0.875,bilinear
 seresnet34,24.0268,75.9732,41.9089,58.0911,21.96,224,0.875,bilinear
 densenet121,23.8441,76.1559,41.9246,58.0754,7.98,224,0.875,bicubic
 tf_efficientnet_es,23.8185,76.1815,41.3311,58.6689,5.44,224,0.875,bicubic
 mixnet_m,23.7104,76.2896,41.1405,58.8595,5.01,224,0.875,bicubic
-dla34,23.6692,76.3308,41.5512,58.4488,15.78,224,0.875,bilinear
+dla34,23.6692,76.3308,41.5532,58.4468,15.78,224,0.875,bilinear
 seresnet50,23.6515,76.3485,40.0912,59.9088,28.09,224,0.875,bilinear
 tf_mixnet_m,23.4844,76.5156,40.9892,59.0108,5.01,224,0.875,bicubic
 tv_resnet34,23.4727,76.5273,41.3665,58.6335,21.8,224,0.875,bilinear
--- a/sotabench.py
+++ b/sotabench.py
@ -56,8 +56,7 @@ model_list = [
           model_desc='Trained from scratch in PyTorch w/ RandAugment'),
    _entry('efficientnet_es', 'EfficientNet-EdgeTPU-S', '1905.11946',
           model_desc='Trained from scratch in PyTorch w/ RandAugment'),
-    _entry('fbnetc_100', 'FBNet-C', '1812.03443',
-           model_desc='Trained in PyTorch with RMSProp, exponential LR decay'),
+
    _entry('gluon_inception_v3', 'Inception V3', '1512.00567', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_resnet18_v1b', 'ResNet-18', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_resnet34_v1b', 'ResNet-34', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
@ -82,14 +81,22 @@ model_list = [
    _entry('gluon_seresnext101_64x4d', 'SE-ResNeXt-101 64x4d', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_xception65', 'Modified Aligned Xception', '1802.02611', batch_size=BATCH_SIZE//2,
           model_desc='Ported from GluonCV Model Zoo'),
+
    _entry('mixnet_xl', 'MixNet-XL', '1907.09595', model_desc="My own scaling beyond paper's MixNet Large"),
    _entry('mixnet_l', 'MixNet-L', '1907.09595'),
    _entry('mixnet_m', 'MixNet-M', '1907.09595'),
    _entry('mixnet_s', 'MixNet-S', '1907.09595'),
+
+    _entry('fbnetc_100', 'FBNet-C', '1812.03443',
+           model_desc='Trained in PyTorch with RMSProp, exponential LR decay'),
    _entry('mnasnet_100', 'MnasNet-B1', '1807.11626'),
+    _entry('semnasnet_100', 'MnasNet-A1', '1807.11626'),
+    _entry('spnasnet_100', 'Single-Path NAS', '1904.02877',
+           model_desc='Trained in PyTorch with SGD, cosine LR decay'),
    _entry('mobilenetv3_rw', 'MobileNet V3-Large 1.0', '1905.02244',
           model_desc='Trained in PyTorch with RMSProp, exponential LR decay, and hyper-params matching '
                      'paper as closely as possible.'),
+
    _entry('resnet18', 'ResNet-18', '1812.01187'),
    _entry('resnet26', 'ResNet-26', '1812.01187', model_desc='Block cfg of ResNet-34 w/ Bottleneck'),
    _entry('resnet26d', 'ResNet-26-D', '1812.01187',
@ -103,7 +110,7 @@ model_list = [
    _entry('resnext50d_32x4d', 'ResNeXt-50-D 32x4d', '1812.01187',
           model_desc="'D' variant (3x3 deep stem w/ avg-pool downscale). Trained with "
                      "SGD w/ cosine LR decay, random-erasing (gaussian per-pixel noise) and label-smoothing"),
-    _entry('semnasnet_100', 'MnasNet-A1', '1807.11626'),
+
    _entry('seresnet18', 'SE-ResNet-18', '1709.01507'),
    _entry('seresnet34', 'SE-ResNet-34', '1709.01507'),
    _entry('seresnext26_32x4d', 'SE-ResNeXt-26 32x4d', '1709.01507',
@ -114,8 +121,9 @@ model_list = [
           model_desc='Block cfg of SE-ResNeXt-34 w/ Bottleneck, deep tiered stem, and avg-pool in downsample layers.'),
    _entry('seresnext26tn_32x4d', 'SE-ResNeXt-26-TN 32x4d', '1812.01187',
           model_desc='Block cfg of SE-ResNeXt-34 w/ Bottleneck, deep tiered narrow stem, and avg-pool in downsample layers.'),
-    _entry('spnasnet_100', 'Single-Path NAS', '1904.02877',
-           model_desc='Trained in PyTorch with SGD, cosine LR decay'),
+
+    _entry('skresnet18', 'SK-ResNet-18', '1903.06586'),
+    _entry('skresnext50_32x4d', 'SKNet-50', '1903.06586'),

    _entry('tf_efficientnet_b0', 'EfficientNet-B0 (AutoAugment)', '1905.11946',
           model_desc='Ported from official Google AI Tensorflow weights'),
--- a/timm/models/init.py
+++ b/timm/models/init.py
@ -16,9 +16,10 @@ from .gluon_xception import *
 from .res2net import *
 from .dla import *
 from .hrnet import *
+from .sknet import *

 from .registry import *
 from .factory import create_model
 from .helpers import load_checkpoint, resume_checkpoint
-from .test_time_pool import TestTimePoolHead, apply_test_time_pool
-from .split_batchnorm import convert_splitbn_model
+from .layers import TestTimePoolHead, apply_test_time_pool
+from .layers import convert_splitbn_model
--- a/timm/models/conv2d_layers.py
+++ b/timm/models/conv2d_layers.py
@ -1,260 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch._six import container_abcs
-from itertools import repeat
-from functools import partial
-import numpy as np
-import math
-
-
-# Tuple helpers ripped from PyTorch
-def _ntuple(n):
-    def parse(x):
-        if isinstance(x, container_abcs.Iterable):
-            return x
-        return tuple(repeat(x, n))
-    return parse
-
-
-_single = _ntuple(1)
-_pair = _ntuple(2)
-_triple = _ntuple(3)
-_quadruple = _ntuple(4)
-
-
-def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
-    return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
-
-
-def _get_padding(kernel_size, stride=1, dilation=1, **_):
-    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
-    return padding
-
-
-def _calc_same_pad(i, k, s, d):
-    return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
-
-
-def _split_channels(num_chan, num_groups):
-    split = [num_chan // num_groups for _ in range(num_groups)]
-    split[0] += num_chan - sum(split)
-    return split
-
-
-# pylint: disable=unused-argument
-def conv2d_same(x, weight, bias=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1):
-    ih, iw = x.size()[-2:]
-    kh, kw = weight.size()[-2:]
-    pad_h = _calc_same_pad(ih, kh, stride[0], dilation[0])
-    pad_w = _calc_same_pad(iw, kw, stride[1], dilation[1])
-    if pad_h > 0 or pad_w > 0:
-        x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2])
-    return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
-
-
-class Conv2dSame(nn.Conv2d):
-    """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
-    """
-
-    # pylint: disable=unused-argument
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1, bias=True):
-        super(Conv2dSame, self).__init__(
-            in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
-
-    def forward(self, x):
-        return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
-
-
-def get_padding_value(padding, kernel_size, **kwargs):
-    dynamic = False
-    if isinstance(padding, str):
-        # for any string padding, the padding will be calculated for you, one of three ways
-        padding = padding.lower()
-        if padding == 'same':
-            # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact
-            if _is_static_pad(kernel_size, **kwargs):
-                # static case, no extra overhead
-                padding = _get_padding(kernel_size, **kwargs)
-            else:
-                # dynamic 'SAME' padding, has runtime/GPU memory overhead
-                padding = 0
-                dynamic = True
-        elif padding == 'valid':
-            # 'VALID' padding, same as padding=0
-            padding = 0
-        else:
-            # Default to PyTorch style 'same'-ish symmetric padding
-            padding = _get_padding(kernel_size, **kwargs)
-    return padding, dynamic
-
-
-def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
-    padding = kwargs.pop('padding', '')
-    kwargs.setdefault('bias', False)
-    padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs)
-    if is_dynamic:
-        return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
-    else:
-        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
-
-
-class MixedConv2d(nn.Module):
-    """ Mixed Grouped Convolution
-    Based on MDConv and GroupedConv in MixNet impl:
-      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
-
-    NOTE: This does not currently work with torch.jit.script
-    """
-
-    def __init__(self, in_channels, out_channels, kernel_size=3,
-                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
-        super(MixedConv2d, self).__init__()
-
-        kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size]
-        num_groups = len(kernel_size)
-        in_splits = _split_channels(in_channels, num_groups)
-        out_splits = _split_channels(out_channels, num_groups)
-        self.in_channels = sum(in_splits)
-        self.out_channels = sum(out_splits)
-        for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)):
-            conv_groups = out_ch if depthwise else 1
-            # use add_module to keep key space clean
-            self.add_module(
-                str(idx),
-                create_conv2d_pad(
-                    in_ch, out_ch, k, stride=stride,
-                    padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
-            )
-        self.splits = in_splits
-
-    def forward(self, x):
-        x_split = torch.split(x, self.splits, 1)
-        x_out = [c(x) for x, c in zip(x_split, self._modules.values())]
-        x = torch.cat(x_out, 1)
-        return x
-
-
-def get_condconv_initializer(initializer, num_experts, expert_shape):
-    def condconv_initializer(weight):
-        """CondConv initializer function."""
-        num_params = np.prod(expert_shape)
-        if (len(weight.shape) != 2 or weight.shape[0] != num_experts or
-                weight.shape[1] != num_params):
-            raise (ValueError(
-                'CondConv variables must have shape [num_experts, num_params]'))
-        for i in range(num_experts):
-            initializer(weight[i].view(expert_shape))
-    return condconv_initializer
-
-
-class CondConv2d(nn.Module):
-    """ Conditional Convolution
-    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
-
-    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
-    https://github.com/pytorch/pytorch/issues/17983
-    """
-    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
-
-    def __init__(self, in_channels, out_channels, kernel_size=3,
-                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
-        super(CondConv2d, self).__init__()
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = _pair(kernel_size)
-        self.stride = _pair(stride)
-        padding_val, is_padding_dynamic = get_padding_value(
-            padding, kernel_size, stride=stride, dilation=dilation)
-        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
-        self.padding = _pair(padding_val)
-        self.dilation = _pair(dilation)
-        self.groups = groups
-        self.num_experts = num_experts
-
-        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
-        weight_num_param = 1
-        for wd in self.weight_shape:
-            weight_num_param *= wd
-        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
-
-        if bias:
-            self.bias_shape = (self.out_channels,)
-            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
-        else:
-            self.register_parameter('bias', None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        init_weight = get_condconv_initializer(
-            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
-        init_weight(self.weight)
-        if self.bias is not None:
-            fan_in = np.prod(self.weight_shape[1:])
-            bound = 1 / math.sqrt(fan_in)
-            init_bias = get_condconv_initializer(
-                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
-            init_bias(self.bias)
-
-    def forward(self, x, routing_weights):
-        B, C, H, W = x.shape
-        weight = torch.matmul(routing_weights, self.weight)
-        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
-        weight = weight.view(new_weight_shape)
-        bias = None
-        if self.bias is not None:
-            bias = torch.matmul(routing_weights, self.bias)
-            bias = bias.view(B * self.out_channels)
-        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
-        x = x.view(1, B * C, H, W)
-        if self.dynamic_padding:
-            out = conv2d_same(
-                x, weight, bias, stride=self.stride, padding=self.padding,
-                dilation=self.dilation, groups=self.groups * B)
-        else:
-            out = F.conv2d(
-                x, weight, bias, stride=self.stride, padding=self.padding,
-                dilation=self.dilation, groups=self.groups * B)
-        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
-
-        # Literal port (from TF definition)
-        # x = torch.split(x, 1, 0)
-        # weight = torch.split(weight, 1, 0)
-        # if self.bias is not None:
-        #     bias = torch.matmul(routing_weights, self.bias)
-        #     bias = torch.split(bias, 1, 0)
-        # else:
-        #     bias = [None] * B
-        # out = []
-        # for xi, wi, bi in zip(x, weight, bias):
-        #     wi = wi.view(*self.weight_shape)
-        #     if bi is not None:
-        #         bi = bi.view(*self.bias_shape)
-        #     out.append(self.conv_fn(
-        #         xi, wi, bi, stride=self.stride, padding=self.padding,
-        #         dilation=self.dilation, groups=self.groups))
-        # out = torch.cat(out, 0)
-        return out
-
-
-# helper method
-def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
-    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
-    if isinstance(kernel_size, list):
-        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
-        # We're going to use only lists for defining the MixedConv2d kernel groups,
-        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
-        m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
-    else:
-        depthwise = kwargs.pop('depthwise', False)
-        groups = out_chs if depthwise else 1
-        if 'num_experts' in kwargs and kwargs['num_experts'] > 0:
-            m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
-        else:
-            m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
-    return m
-
-
--- a/timm/models/densenet.py
+++ b/timm/models/densenet.py
@ -10,7 +10,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 import re

--- a/timm/models/dla.py
+++ b/timm/models/dla.py
@ -13,7 +13,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


--- a/timm/models/dpn.py
+++ b/timm/models/dpn.py
@ -16,7 +16,7 @@ from collections import OrderedDict

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD


--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@ -28,8 +28,8 @@ from .efficientnet_builder import *
 from .feature_hooks import FeatureHooks
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
-from .conv2d_layers import select_conv2d
+from .layers import SelectAdaptivePool2d
+from timm.models.layers import create_conv2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD


@ -194,7 +194,7 @@ default_cfgs = {
        input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936),
    'tf_efficientnet_l2_ns': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth',
-        input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961),
+        input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96),
    'tf_efficientnet_es': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
@ -253,7 +253,7 @@ class EfficientNet(nn.Module):

    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                 channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(EfficientNet, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -265,21 +265,21 @@ class EfficientNet(nn.Module):

        # Stem
        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size

        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, channel_divisor, channel_min, 32, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
+            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
+            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
        self._in_chs = builder.in_chs

        # Head + Pooling
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
+        self.conv_head = create_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
        self.bn2 = norm_layer(self.num_features, **norm_kwargs)
        self.act2 = act_layer(inplace=True)
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
@ -333,7 +333,7 @@ class EfficientNetFeatures(nn.Module):

    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(EfficientNetFeatures, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -347,7 +347,7 @@ class EfficientNetFeatures(nn.Module):

        # Stem
        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -355,7 +355,7 @@ class EfficientNetFeatures(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
        self._in_chs = builder.in_chs
@ -875,7 +875,7 @@ def spnasnet_100(pretrained=False, **kwargs):
@register_model
 def efficientnet_b0(pretrained=False, **kwargs):
    """ EfficientNet-B0 """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
    return model
@ -884,7 +884,7 @@ def efficientnet_b0(pretrained=False, **kwargs):
@register_model
 def efficientnet_b1(pretrained=False, **kwargs):
    """ EfficientNet-B1 """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
    return model
@ -893,7 +893,7 @@ def efficientnet_b1(pretrained=False, **kwargs):
@register_model
 def efficientnet_b2(pretrained=False, **kwargs):
    """ EfficientNet-B2 """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
    return model
@ -902,7 +902,7 @@ def efficientnet_b2(pretrained=False, **kwargs):
@register_model
 def efficientnet_b2a(pretrained=False, **kwargs):
    """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop"""
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b2a', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
    return model
@ -911,7 +911,7 @@ def efficientnet_b2a(pretrained=False, **kwargs):
@register_model
 def efficientnet_b3(pretrained=False, **kwargs):
    """ EfficientNet-B3 """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
    return model
@ -920,7 +920,7 @@ def efficientnet_b3(pretrained=False, **kwargs):
@register_model
 def efficientnet_b3a(pretrained=False, **kwargs):
    """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b3a', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
    return model
@ -929,7 +929,7 @@ def efficientnet_b3a(pretrained=False, **kwargs):
@register_model
 def efficientnet_b4(pretrained=False, **kwargs):
    """ EfficientNet-B4 """
-    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
    return model
@ -938,7 +938,7 @@ def efficientnet_b4(pretrained=False, **kwargs):
@register_model
 def efficientnet_b5(pretrained=False, **kwargs):
    """ EfficientNet-B5 """
-    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
    return model
@ -947,7 +947,7 @@ def efficientnet_b5(pretrained=False, **kwargs):
@register_model
 def efficientnet_b6(pretrained=False, **kwargs):
    """ EfficientNet-B6 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
    return model
@ -956,7 +956,7 @@ def efficientnet_b6(pretrained=False, **kwargs):
@register_model
 def efficientnet_b7(pretrained=False, **kwargs):
    """ EfficientNet-B7 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
    return model
@ -965,7 +965,7 @@ def efficientnet_b7(pretrained=False, **kwargs):
@register_model
 def efficientnet_b8(pretrained=False, **kwargs):
    """ EfficientNet-B8 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
    return model
@ -974,7 +974,7 @@ def efficientnet_b8(pretrained=False, **kwargs):
@register_model
 def efficientnet_l2(pretrained=False, **kwargs):
    """ EfficientNet-L2."""
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
    return model
@ -1007,7 +1007,7 @@ def efficientnet_el(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 4 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
    return model
@ -1016,7 +1016,7 @@ def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
        pretrained=pretrained, **kwargs)
@ -1025,7 +1025,7 @@ def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B1 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
        pretrained=pretrained, **kwargs)
@ -1355,7 +1355,7 @@ def tf_efficientnet_el(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
@ -1366,7 +1366,7 @@ def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
@ -1377,7 +1377,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@ -1,11 +1,8 @@
-
-from functools import partial
-
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
-from .activations import sigmoid
-from .conv2d_layers import *
+from torch.nn import functional as F
+from .layers.activations import sigmoid
+from .layers import create_conv2d, drop_path


 # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
@ -72,19 +69,6 @@ def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
    return make_divisible(channels, divisor, channel_min)


-def drop_connect(inputs, training=False, drop_connect_rate=0.):
-    """Apply drop connect."""
-    if not training:
-        return inputs
-
-    keep_prob = 1 - drop_connect_rate
-    random_tensor = keep_prob + torch.rand(
-        (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)
-    random_tensor.floor_()  # binarize
-    output = inputs.div(keep_prob) * random_tensor
-    return output
-
-
 class ChannelShuffle(nn.Module):
    # FIXME haven't used yet
    def __init__(self, groups):
@ -132,7 +116,7 @@ class ConvBnAct(nn.Module):
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(ConvBnAct, self).__init__()
        norm_kwargs = norm_kwargs or {}
-        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
        self.bn1 = norm_layer(out_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

@ -157,25 +141,27 @@ class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0.):
        super(DepthwiseSeparableConv, self).__init__()
        norm_kwargs = norm_kwargs or {}
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
        self.has_pw_act = pw_act  # activation after point-wise conv
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate

-        self.conv_dw = select_conv2d(
+        self.conv_dw = create_conv2d(
            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
        self.bn1 = norm_layer(in_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

-        self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+        self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
        self.bn2 = norm_layer(out_chs, **norm_kwargs)
        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()

@ -193,7 +179,7 @@ class DepthwiseSeparableConv(nn.Module):
        x = self.bn1(x)
        x = self.act1(x)

-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)

        x = self.conv_pw(x)
@ -201,8 +187,8 @@ class DepthwiseSeparableConv(nn.Module):
        x = self.act2(x)

        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+            if self.drop_path_rate > 0.:
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x

@ -214,34 +200,36 @@ class InvertedResidual(nn.Module):
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 conv_kwargs=None, drop_connect_rate=0.):
+                 conv_kwargs=None, drop_path_rate=0.):
        super(InvertedResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
        conv_kwargs = conv_kwargs or {}
        mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate

        # Point-wise expansion
-        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
+        self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

        # Depth-wise convolution
-        self.conv_dw = select_conv2d(
+        self.conv_dw = create_conv2d(
            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
            padding=pad_type, depthwise=True, **conv_kwargs)
        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
        self.act2 = act_layer(inplace=True)

        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
+        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
        self.bn3 = norm_layer(out_chs, **norm_kwargs)

    def feature_module(self, location):
@ -269,7 +257,7 @@ class InvertedResidual(nn.Module):
        x = self.act2(x)

        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)

        # Point-wise linear projection
@ -277,8 +265,8 @@ class InvertedResidual(nn.Module):
        x = self.bn3(x)

        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+            if self.drop_path_rate > 0.:
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual

        return x
@ -291,7 +279,7 @@ class CondConvResidual(InvertedResidual):
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 num_experts=0, drop_connect_rate=0.):
+                 num_experts=0, drop_path_rate=0.):

        self.num_experts = num_experts
        conv_kwargs = dict(num_experts=self.num_experts)
@ -301,7 +289,7 @@ class CondConvResidual(InvertedResidual):
            act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
            pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
            norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
-            drop_connect_rate=drop_connect_rate)
+            drop_path_rate=drop_path_rate)

        self.routing_fn = nn.Linear(in_chs, self.num_experts)

@ -323,7 +311,7 @@ class CondConvResidual(InvertedResidual):
        x = self.act2(x)

        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)

        # Point-wise linear projection
@ -331,8 +319,8 @@ class CondConvResidual(InvertedResidual):
        x = self.bn3(x)

        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+            if self.drop_path_rate > 0.:
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x

@ -343,29 +331,31 @@ class EdgeResidual(nn.Module):
    def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 drop_connect_rate=0.):
+                 drop_path_rate=0.):
        super(EdgeResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
        if fake_in_chs > 0:
            mid_chs = make_divisible(fake_in_chs * exp_ratio)
        else:
            mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate

        # Expansion convolution
-        self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
+        self.conv_exp = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)

        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
+        else:
+            self.se = None

        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(
+        self.conv_pwl = create_conv2d(
            mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
        self.bn2 = norm_layer(out_chs, **norm_kwargs)

@ -389,7 +379,7 @@ class EdgeResidual(nn.Module):
        x = self.act1(x)

        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)

        # Point-wise linear projection
@ -397,8 +387,8 @@ class EdgeResidual(nn.Module):
        x = self.bn2(x)

        if self.has_residual:
-            if self.drop_connect_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+            if self.drop_path_rate > 0.:
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual

        return x
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@ -5,7 +5,8 @@ from collections.__init__ import OrderedDict
 from copy import deepcopy

 import torch.nn as nn
-from .activations import sigmoid, HardSwish, Swish
+from .layers import CondConv2d, get_condconv_initializer
+from .layers.activations import HardSwish, Swish
 from .efficientnet_blocks import *


@ -201,7 +202,7 @@ class EfficientNetBuilder:
    """
    def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                 output_stride=32, pad_type='', act_layer=None, se_kwargs=None,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0., feature_location='',
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0., feature_location='',
                 verbose=False):
        self.channel_multiplier = channel_multiplier
        self.channel_divisor = channel_divisor
@ -212,7 +213,7 @@ class EfficientNetBuilder:
        self.se_kwargs = se_kwargs
        self.norm_layer = norm_layer
        self.norm_kwargs = norm_kwargs
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate
        self.feature_location = feature_location
        assert feature_location in ('pre_pwl', 'post_exp', '')
        self.verbose = verbose
@ -225,7 +226,7 @@ class EfficientNetBuilder:
        return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)

    def _make_block(self, ba, block_idx, block_count):
-        drop_connect_rate = self.drop_connect_rate * block_idx / block_count
+        drop_path_rate = self.drop_path_rate * block_idx / block_count
        bt = ba.pop('block_type')
        ba['in_chs'] = self.in_chs
        ba['out_chs'] = self._round_channels(ba['out_chs'])
@ -239,7 +240,7 @@ class EfficientNetBuilder:
        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
        assert ba['act_layer'] is not None
        if bt == 'ir':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
@ -248,13 +249,13 @@ class EfficientNetBuilder:
            else:
                block = InvertedResidual(**ba)
        elif bt == 'ds' or bt == 'dsa':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
            block = DepthwiseSeparableConv(**ba)
        elif bt == 'er':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
--- a/timm/models/factory.py
+++ b/timm/models/factory.py
@ -31,7 +31,21 @@ def create_model(
        kwargs.pop('bn_tf', None)
        kwargs.pop('bn_momentum', None)
        kwargs.pop('bn_eps', None)
-        kwargs.pop('drop_connect_rate', None)
+
+    # Parameters that aren't supported by all models should default to None in command line args,
+    # remove them if they are present and not set so that non-supporting models don't break.
+    if kwargs.get('drop_block_rate', None) is None:
+        kwargs.pop('drop_block_rate', None)
+
+    # handle backwards compat with drop_connect -> drop_path change
+    drop_connect_rate = kwargs.pop('drop_connect_rate', None)
+    if drop_connect_rate is not None and kwargs.get('drop_path_rate', None) is None:
+        print("WARNING: 'drop_connect' as an argument is deprecated, please use 'drop_path'."
+              " Setting drop_path to %f." % drop_connect_rate)
+        kwargs['drop_path_rate'] = drop_connect_rate
+
+    if kwargs.get('drop_path_rate', None) is None:
+        kwargs.pop('drop_path_rate', None)

    if is_model(model_name):
        create_fn = model_entrypoint(model_name)
--- a/timm/models/gluon_resnet.py
+++ b/timm/models/gluon_resnet.py
@ -11,6 +11,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
+from .layers import SEModule
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 from .resnet import ResNet, Bottleneck, BasicBlock
@ -319,8 +320,8 @@ def gluon_seresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kw
    """
    default_cfg = default_cfgs['gluon_seresnext50_32x4d']
    model = ResNet(
-        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -333,8 +334,8 @@ def gluon_seresnext101_32x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    """
    default_cfg = default_cfgs['gluon_seresnext101_32x4d']
    model = ResNet(
-        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -346,9 +347,10 @@ def gluon_seresnext101_64x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    """Constructs a SEResNeXt-101-64x4d model.
    """
    default_cfg = default_cfgs['gluon_seresnext101_64x4d']
+    block_args = dict(attn_layer=SEModule)
    model = ResNet(
-        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4,
+        num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -360,10 +362,10 @@ def gluon_senet154(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs an SENet-154 model.
    """
    default_cfg = default_cfgs['gluon_senet154']
+    block_args = dict(attn_layer=SEModule)
    model = ResNet(
-        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, use_se=True,
-        stem_type='deep', down_kernel_size=3, block_reduce_first=2,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', down_kernel_size=3,
+        block_reduce_first=2, num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
--- a/timm/models/gluon_xception.py
+++ b/timm/models/gluon_xception.py
@ -13,7 +13,7 @@ from collections import OrderedDict

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 __all__ = ['Xception65', 'Xception71']
--- a/timm/models/hrnet.py
+++ b/timm/models/hrnet.py
@ -25,7 +25,7 @@ import torch.nn.functional as F
 from .resnet import BasicBlock, Bottleneck  # leveraging ResNet blocks w/ additional features like SE
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 _BN_MOMENTUM = 0.1
--- a/timm/models/inception_resnet_v2.py
+++ b/timm/models/inception_resnet_v2.py
@ -8,7 +8,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD

 __all__ = ['InceptionResnetV2']
--- a/timm/models/inception_v4.py
+++ b/timm/models/inception_v4.py
@ -8,7 +8,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD

 __all__ = ['InceptionV4']
--- a/timm/models/layers/init.py
+++ b/timm/models/layers/init.py
@ -0,0 +1,17 @@
+from .padding import get_padding
+from .avg_pool2d_same import AvgPool2dSame
+from .conv2d_same import Conv2dSame
+from .conv_bn_act import ConvBnAct
+from .mixed_conv2d import MixedConv2d
+from .cond_conv2d import CondConv2d, get_condconv_initializer
+from .create_conv2d import create_conv2d
+from .create_attn import create_attn
+from .selective_kernel import SelectiveKernelConv
+from .se import SEModule
+from .eca import EcaModule, CecaModule
+from .activations import *
+from .adaptive_avgmax_pool import \
+    adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
+from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
+from .test_time_pool import TestTimePoolHead, apply_test_time_pool
+from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
--- a/timm/models/layers/activations.py
+++ b/timm/models/layers/activations.py
@ -1,3 +1,12 @@
+""" Activations
+
+A collection of activations fn and modules with a common interface so that they can
+easily be swapped. All have an `inplace` arg even if not used.
+
+Hacked together by Ross Wightman
+"""
+
+
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@ -66,20 +75,20 @@ if _USE_MEM_EFFICIENT_ISH:
        return MishJitAutoFn.apply(x)

 else:
-    def swish(x, inplace=False):
+    def swish(x, inplace: bool = False):
        """Swish - Described in: https://arxiv.org/abs/1710.05941
        """
        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())


-    def mish(x, _inplace=False):
+    def mish(x, _inplace: bool = False):
        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
        """
        return x.mul(F.softplus(x).tanh())


 class Swish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Swish, self).__init__()
        self.inplace = inplace

@ -88,7 +97,7 @@ class Swish(nn.Module):


 class Mish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Mish, self).__init__()
        self.inplace = inplace

@ -96,13 +105,13 @@ class Mish(nn.Module):
        return mish(x, self.inplace)


-def sigmoid(x, inplace=False):
+def sigmoid(x, inplace: bool = False):
    return x.sigmoid_() if inplace else x.sigmoid()


 # PyTorch has this, but not with a consistent inplace argument interface
 class Sigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Sigmoid, self).__init__()
        self.inplace = inplace

@ -110,13 +119,13 @@ class Sigmoid(nn.Module):
        return x.sigmoid_() if self.inplace else x.sigmoid()


-def tanh(x, inplace=False):
+def tanh(x, inplace: bool = False):
    return x.tanh_() if inplace else x.tanh()


 # PyTorch has this, but not with a consistent inplace argument interface
 class Tanh(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Tanh, self).__init__()
        self.inplace = inplace

@ -124,13 +133,13 @@ class Tanh(nn.Module):
        return x.tanh_() if self.inplace else x.tanh()


-def hard_swish(x, inplace=False):
+def hard_swish(x, inplace: bool = False):
    inner = F.relu6(x + 3.).div_(6.)
    return x.mul_(inner) if inplace else x.mul(inner)


 class HardSwish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(HardSwish, self).__init__()
        self.inplace = inplace

@ -138,7 +147,7 @@ class HardSwish(nn.Module):
        return hard_swish(x, self.inplace)


-def hard_sigmoid(x, inplace=False):
+def hard_sigmoid(x, inplace: bool = False):
    if inplace:
        return x.add_(3.).clamp_(0., 6.).div_(6.)
    else:
@ -146,7 +155,7 @@ def hard_sigmoid(x, inplace=False):


 class HardSigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(HardSigmoid, self).__init__()
        self.inplace = inplace

--- a/timm/models/layers/adaptive_avgmax_pool.py
+++ b/timm/models/layers/adaptive_avgmax_pool.py
--- a/timm/models/layers/avg_pool2d_same.py
+++ b/timm/models/layers/avg_pool2d_same.py
@ -0,0 +1,31 @@
+""" AvgPool2d w/ Same Padding
+
+Hacked together by Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import List
+import math
+
+from .helpers import tup_pair
+from .padding import pad_same
+
+
+def avg_pool2d_same(x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0),
+                    ceil_mode: bool = False, count_include_pad: bool = True):
+    """Run `x` through `pad_same(x, kernel_size, stride)`, then average-pool the result.
+
+    `padding` is accepted but never read in this function: `F.avg_pool2d` is always called
+    with an explicit padding of (0, 0), so the only padding is whatever `pad_same` applies.
+    `ceil_mode` and `count_include_pad` are forwarded to `F.avg_pool2d` unchanged.
+    """
+    x = pad_same(x, kernel_size, stride)
+    return F.avg_pool2d(x, kernel_size, stride, (0, 0), ceil_mode, count_include_pad)
+
+
+class AvgPool2dSame(nn.AvgPool2d):
+    """ Tensorflow like 'SAME' wrapper for 2D average pooling
+
+    Args:
+        kernel_size: pooling window size; expanded to a pair with `tup_pair`.
+        stride: pooling stride; `None` (the default) means "same as `kernel_size`",
+            matching the `nn.AvgPool2d` convention.
+        padding: accepted for signature compatibility only; the base class is always
+            constructed with padding (0, 0) and padding is applied in `forward` instead.
+        ceil_mode: forwarded to `nn.AvgPool2d`.
+        count_include_pad: forwarded to `nn.AvgPool2d`.
+    """
+    def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
+        kernel_size = tup_pair(kernel_size)
+        # Resolve the documented `stride=None` default here, so `forward` never hands an
+        # unresolved stride to `avg_pool2d_same` (which passes it on to `pad_same`).
+        stride = kernel_size if stride is None else tup_pair(stride)
+        super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad)
+
+    def forward(self, x):
+        # self.padding is the (0, 0) set in __init__; avg_pool2d_same does the actual padding.
+        return avg_pool2d_same(
+            x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad)
--- a/timm/models/layers/cbam.py
+++ b/timm/models/layers/cbam.py
@ -0,0 +1,100 @@
+""" CBAM (sort-of) Attention
+
+Experimental impl of CBAM: Convolutional Block Attention Module: https://arxiv.org/abs/1807.06521
+
+WARNING: Results with these attention layers have been mixed. They can significantly reduce performance on
+some tasks, especially fine-grained ones, it seems. I may end up removing this impl.
+
+Hacked together by Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+from .conv_bn_act import ConvBnAct
+
+
+class ChannelAttn(nn.Module):
+    """ Original CBAM channel attention module, currently avg + max pool variant only.
+
+    Args:
+        channels: channel count of the input; also the output width of `fc2`.
+        reduction: divisor for the hidden width, which is `channels // reduction`.
+        act_layer: activation constructor, called as `act_layer(inplace=True)`.
+    """
+    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
+        super(ChannelAttn, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.max_pool = nn.AdaptiveMaxPool2d(1)
+        # NOTE(review): `channels // reduction` is 0 when channels < reduction -- confirm callers
+        # never use this module with fewer channels than `reduction`.
+        self.fc1 = nn.Conv2d(channels, channels // reduction, 1, bias=False)
+        self.act = act_layer(inplace=True)
+        self.fc2 = nn.Conv2d(channels // reduction, channels, 1, bias=False)
+
+    def forward(self, x):
+        x_avg = self.avg_pool(x)
+        x_max = self.max_pool(x)
+        # The same fc1 -> act -> fc2 stack (shared weights) is applied to both pooled tensors.
+        x_avg = self.fc2(self.act(self.fc1(x_avg)))
+        x_max = self.fc2(self.act(self.fc1(x_max)))
+        x_attn = x_avg + x_max
+        # Scale the input by the sigmoid of the summed branch outputs.
+        return x * x_attn.sigmoid()
+
+
+class LightChannelAttn(ChannelAttn):
+    """An experimental 'lightweight' variant that sums avg + max pool first
+
+    The two pooled tensors are averaged (0.5 / 0.5) before a single pass through
+    fc1 -> act -> fc2, instead of running each pooled tensor through the stack separately.
+    """
+    def __init__(self, channels, reduction=16):
+        # No act_layer is forwarded, so the parent's default (nn.ReLU) is used.
+        super(LightChannelAttn, self).__init__(channels, reduction)
+
+    def forward(self, x):
+        x_pool = 0.5 * self.avg_pool(x) + 0.5 * self.max_pool(x)
+        x_attn = self.fc2(self.act(self.fc1(x_pool)))
+        return x * x_attn.sigmoid()
+
+
+class SpatialAttn(nn.Module):
+    """ Original CBAM spatial attention module
+
+    Args:
+        kernel_size: kernel size passed to the `ConvBnAct` that maps the 2-channel
+            (mean, max) tensor to a 1-channel attention map.
+    """
+    def __init__(self, kernel_size=7):
+        super(SpatialAttn, self).__init__()
+        # 2 input channels (mean + max over dim 1), 1 output channel; act_layer=None is passed
+        # to ConvBnAct -- presumably this disables its activation, verify in conv_bn_act.py.
+        self.conv = ConvBnAct(2, 1, kernel_size, act_layer=None)
+
+    def forward(self, x):
+        x_avg = torch.mean(x, dim=1, keepdim=True)
+        # torch.max over a dim returns (values, indices); [0] keeps the values.
+        x_max = torch.max(x, dim=1, keepdim=True)[0]
+        x_attn = torch.cat([x_avg, x_max], dim=1)
+        x_attn = self.conv(x_attn)
+        return x * x_attn.sigmoid()
+
+
+class LightSpatialAttn(nn.Module):
+    """An experimental 'lightweight' variant that sums avg_pool and max_pool results.
+
+    The mean and max over dim 1 are averaged (0.5 / 0.5) into one channel before the
+    convolution, so `ConvBnAct` is built with 1 input channel rather than 2.
+    """
+    def __init__(self, kernel_size=7):
+        super(LightSpatialAttn, self).__init__()
+        self.conv = ConvBnAct(1, 1, kernel_size, act_layer=None)
+
+    def forward(self, x):
+        x_avg = torch.mean(x, dim=1, keepdim=True)
+        # torch.max over a dim returns (values, indices); [0] keeps the values.
+        x_max = torch.max(x, dim=1, keepdim=True)[0]
+        x_attn = 0.5 * x_avg + 0.5 * x_max
+        x_attn = self.conv(x_attn)
+        return x * x_attn.sigmoid()
+
+
+class CbamModule(nn.Module):
+    """Applies `ChannelAttn` followed by `SpatialAttn` to the input.
+
+    Args:
+        channels: passed to `ChannelAttn` (its `reduction` and `act_layer` defaults are used).
+        spatial_kernel_size: kernel size passed to `SpatialAttn`.
+    """
+    def __init__(self, channels, spatial_kernel_size=7):
+        super(CbamModule, self).__init__()
+        self.channel = ChannelAttn(channels)
+        self.spatial = SpatialAttn(spatial_kernel_size)
+
+    def forward(self, x):
+        # Channel attention first, then spatial attention on its output.
+        x = self.channel(x)
+        x = self.spatial(x)
+        return x
+
+
+class LightCbamModule(nn.Module):
+    """Applies `LightChannelAttn` followed by `LightSpatialAttn` to the input.
+
+    Args:
+        channels: passed to `LightChannelAttn` (its `reduction` default is used).
+        spatial_kernel_size: kernel size passed to `LightSpatialAttn`.
+    """
+    def __init__(self, channels, spatial_kernel_size=7):
+        super(LightCbamModule, self).__init__()
+        self.channel = LightChannelAttn(channels)
+        self.spatial = LightSpatialAttn(spatial_kernel_size)
+
+    def forward(self, x):
+        # Channel attention first, then spatial attention on its output.
+        x = self.channel(x)
+        x = self.spatial(x)
+        return x
+
--- a/timm/models/layers/cond_conv2d.py
+++ b/timm/models/layers/cond_conv2d.py
@ -0,0 +1,121 @@
+""" PyTorch Conditionally Parameterized Convolution (CondConv)
+
+Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference
+(https://arxiv.org/abs/1904.04971)
+
+Hacked together by Ross Wightman
+"""
+
+import math
+from functools import partial
+import numpy as np
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+from .helpers import tup_pair
+from .conv2d_same import get_padding_value, conv2d_same
+
+
+def get_condconv_initializer(initializer, num_experts, expert_shape):
+    """Wrap `initializer` so it is applied separately to each expert of a flat CondConv parameter.
+
+    The returned callable expects a 2D tensor of shape [num_experts, prod(expert_shape)] and
+    calls `initializer` on every row viewed as `expert_shape`.
+
+    Raises:
+        ValueError: (from the returned callable) if the tensor does not have that shape.
+    """
+    def condconv_initializer(weight):
+        """CondConv initializer function."""
+        num_params = np.prod(expert_shape)
+        is_2d = len(weight.shape) == 2
+        if not (is_2d and weight.shape[0] == num_experts and weight.shape[1] == num_params):
+            raise ValueError(
+                'CondConv variables must have shape [num_experts, num_params]')
+        for expert_idx in range(num_experts):
+            expert_view = weight[expert_idx].view(expert_shape)
+            initializer(expert_view)
+    return condconv_initializer
+
+
+class CondConv2d(nn.Module):
+    """ Conditionally Parameterized Convolution
+    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
+
+    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
+    https://github.com/pytorch/pytorch/issues/17983
+
+    The expert kernels are stored flattened in a single parameter of shape
+    (num_experts, prod(weight_shape)). In forward they are mixed by `routing_weights` and the
+    result is executed as one grouped convolution with `groups * B` groups.
+    """
+    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
+
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
+        super(CondConv2d, self).__init__()
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = tup_pair(kernel_size)
+        self.stride = tup_pair(stride)
+        # string padding modes are resolved to a value plus a flag selecting dynamic 'SAME' padding
+        padding_val, is_padding_dynamic = get_padding_value(
+            padding, kernel_size, stride=stride, dilation=dilation)
+        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
+        self.padding = tup_pair(padding_val)
+        self.dilation = tup_pair(dilation)
+        self.groups = groups
+        self.num_experts = num_experts
+
+        # shape of ONE expert's kernel; the parameter holds num_experts flattened copies of it
+        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight_num_param = 1
+        for wd in self.weight_shape:
+            weight_num_param *= wd
+        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
+
+        if bias:
+            # bias_shape is only defined when bias is enabled; it is read in reset_parameters
+            self.bias_shape = (self.out_channels,)
+            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
+        else:
+            self.register_parameter('bias', None)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        # each expert row is initialized independently, viewed in its real kernel shape
+        init_weight = get_condconv_initializer(
+            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
+        init_weight(self.weight)
+        if self.bias is not None:
+            # fan_in = (in_channels // groups) * kernel_h * kernel_w
+            fan_in = np.prod(self.weight_shape[1:])
+            bound = 1 / math.sqrt(fan_in)
+            init_bias = get_condconv_initializer(
+                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
+            init_bias(self.bias)
+
+    def forward(self, x, routing_weights):
+        # NOTE(review): routing_weights is presumably (B, num_experts) so that the matmul below
+        # yields one flattened kernel per sample - confirm against the caller.
+        B, C, H, W = x.shape
+        weight = torch.matmul(routing_weights, self.weight)
+        # stack the per-sample kernels along the output-channel axis
+        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
+        weight = weight.view(new_weight_shape)
+        bias = None
+        if self.bias is not None:
+            bias = torch.matmul(routing_weights, self.bias)
+            bias = bias.view(B * self.out_channels)
+        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
+        x = x.view(1, B * C, H, W)
+        if self.dynamic_padding:
+            out = conv2d_same(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        else:
+            out = F.conv2d(
+                x, weight, bias, stride=self.stride, padding=self.padding,
+                dilation=self.dilation, groups=self.groups * B)
+        # fold the (1, B * out_channels, h, w) result back into (B, out_channels, h, w)
+        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
+
+        # Literal port (from TF definition)
+        # x = torch.split(x, 1, 0)
+        # weight = torch.split(weight, 1, 0)
+        # if self.bias is not None:
+        #     bias = torch.matmul(routing_weights, self.bias)
+        #     bias = torch.split(bias, 1, 0)
+        # else:
+        #     bias = [None] * B
+        # out = []
+        # for xi, wi, bi in zip(x, weight, bias):
+        #     wi = wi.view(*self.weight_shape)
+        #     if bi is not None:
+        #         bi = bi.view(*self.bias_shape)
+        #     out.append(self.conv_fn(
+        #         xi, wi, bi, stride=self.stride, padding=self.padding,
+        #         dilation=self.dilation, groups=self.groups))
+        # out = torch.cat(out, 0)
+        return out
--- a/timm/models/layers/conv2d_same.py
+++ b/timm/models/layers/conv2d_same.py
@ -0,0 +1,66 @@
+""" Conv2d w/ Same Padding
+
+Hacked together by Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Union, List, Tuple, Optional, Callable
+import math
+
+from .padding import get_padding, pad_same, is_static_pad
+
+
+def conv2d_same(
+        x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1),
+        padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1):
+    """Functional conv2d with dynamic 'SAME' padding.
+
+    The input is padded via `pad_same` for the kernel's spatial size, then convolved with zero
+    padding. The `padding` argument is accepted but not used.
+    """
+    kernel_hw = weight.shape[-2:]
+    padded = pad_same(x, kernel_hw, stride, dilation)
+    return F.conv2d(padded, weight, bias, stride, (0, 0), dilation, groups)
+
+
+class Conv2dSame(nn.Conv2d):
+    """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions
+
+    The `padding` constructor argument is accepted but ignored: the underlying nn.Conv2d is
+    always created with padding 0 and the padding is applied in forward by `conv2d_same`.
+    """
+
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
+                 padding=0, dilation=1, groups=1, bias=True):
+        super(Conv2dSame, self).__init__(
+            in_channels, out_channels, kernel_size,
+            stride=stride, padding=0, dilation=dilation, groups=groups, bias=bias)
+
+    def forward(self, x):
+        return conv2d_same(
+            x, self.weight, self.bias,
+            stride=self.stride, padding=self.padding, dilation=self.dilation, groups=self.groups)
+
+
+def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
+    """Resolve a padding argument into `(padding, dynamic)`.
+
+    Non-string padding is returned unchanged with dynamic=False. For any string padding, the
+    padding is calculated for you, one of three ways:
+      * 'same': TF compatible 'SAME' padding. Static (symmetric value, no overhead) when
+        `is_static_pad` allows it, otherwise padding 0 with dynamic=True (runtime / GPU memory
+        overhead).
+      * 'valid': same as padding=0.
+      * anything else: PyTorch style 'same'-ish symmetric padding.
+    """
+    if not isinstance(padding, str):
+        return padding, False
+    mode = padding.lower()
+    if mode == 'valid':
+        return 0, False
+    if mode == 'same' and not is_static_pad(kernel_size, **kwargs):
+        # dynamic 'SAME' padding, the pad itself is applied at forward time
+        return 0, True
+    # static 'same' and the default case share the symmetric padding calculation
+    return get_padding(kernel_size, **kwargs), False
+
+
+def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
+    """Create a Conv2dSame (dynamic padding) or nn.Conv2d (static padding) from a padding spec.
+
+    `padding` is taken out of kwargs (default '') and resolved with `get_padding_value`;
+    `bias` defaults to False when not supplied. The remaining kwargs go to the convolution.
+    """
+    pad_spec = kwargs.pop('padding', '')
+    kwargs.setdefault('bias', False)
+    pad_value, use_dynamic = get_padding_value(pad_spec, kernel_size, **kwargs)
+    if not use_dynamic:
+        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=pad_value, **kwargs)
+    return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
+
+
--- a/timm/models/layers/conv_bn_act.py
+++ b/timm/models/layers/conv_bn_act.py
@ -0,0 +1,32 @@
+""" Conv2d + BN + Act
+
+Hacked together by Ross Wightman
+"""
+from torch import nn as nn
+
+from timm.models.layers import get_padding
+
+
+class ConvBnAct(nn.Module):
+    """ Bias-free Conv2d followed by a norm layer, an optional drop block and an optional activation.
+
+    Passing `act_layer=None` disables the activation; `drop_block` (if given) runs between
+    the norm layer and the activation.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1,
+                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+        super(ConvBnAct, self).__init__()
+        # assuming PyTorch style padding for this block
+        self.conv = nn.Conv2d(
+            in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride,
+            padding=get_padding(kernel_size, stride, dilation),
+            dilation=dilation, groups=groups, bias=False)
+        self.bn = norm_layer(out_channels)
+        self.drop_block = drop_block
+        self.act = act_layer(inplace=True) if act_layer is not None else None
+
+    def forward(self, x):
+        x = self.bn(self.conv(x))
+        if self.drop_block is not None:
+            x = self.drop_block(x)
+        if self.act is None:
+            return x
+        return self.act(x)
--- a/timm/models/layers/create_attn.py
+++ b/timm/models/layers/create_attn.py
@ -0,0 +1,35 @@
+""" Select AttentionFactory Method
+
+Hacked together by Ross Wightman
+"""
+import torch
+from .se import SEModule
+from .eca import EcaModule, CecaModule
+from .cbam import CbamModule, LightCbamModule
+
+
+def create_attn(attn_type, channels, **kwargs):
+    """Create an attention module from a name, a bool, or a module class.
+
+    Args:
+        attn_type: one of
+            * None - no attention, returns None
+            * str (case-insensitive) - 'se', 'eca', 'ceca', 'cbam' or 'lcbam'
+            * bool - True selects SEModule, False returns None
+            * anything else - treated as a callable/class and invoked as `attn_type(channels, **kwargs)`
+        channels: channel count passed as the first argument to the selected module
+        **kwargs: forwarded to the selected module's constructor
+
+    Returns:
+        The constructed attention module, or None when no attention was requested.
+
+    Raises:
+        AssertionError: if `attn_type` is a string that names no known attention module.
+    """
+    module_cls = None
+    if attn_type is not None:
+        if isinstance(attn_type, str):
+            attn_type = attn_type.lower()
+            if attn_type == 'se':
+                module_cls = SEModule
+            elif attn_type == 'eca':
+                module_cls = EcaModule
+            elif attn_type == 'ceca':
+                # was a second, unreachable 'eca' test, which made CecaModule unselectable
+                module_cls = CecaModule
+            elif attn_type == 'cbam':
+                module_cls = CbamModule
+            elif attn_type == 'lcbam':
+                module_cls = LightCbamModule
+            else:
+                assert False, "Invalid attn module (%s)" % attn_type
+        elif isinstance(attn_type, bool):
+            if attn_type:
+                module_cls = SEModule
+        else:
+            module_cls = attn_type
+    if module_cls is not None:
+        return module_cls(channels, **kwargs)
+    return None
--- a/timm/models/layers/create_conv2d.py
+++ b/timm/models/layers/create_conv2d.py
@ -0,0 +1,30 @@
+""" Create Conv2d Factory Method
+
+Hacked together by Ross Wightman
+"""
+
+from .mixed_conv2d import MixedConv2d
+from .cond_conv2d import CondConv2d
+from .conv2d_same import create_conv2d_pad
+
+
+def create_conv2d(in_chs, out_chs, kernel_size, **kwargs):
+    """ Select a 2d convolution implementation based on arguments
+    Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d.
+
+    Used extensively by EfficientNet, MobileNetv3 and related networks.
+
+    Args:
+        in_chs: input channel count
+        out_chs: output channel count
+        kernel_size: a list selects MixedConv2d (one kernel size per channel group); any other
+            value is passed through to a single convolution
+        **kwargs: forwarded to the selected layer. `depthwise` (bool) sets groups=out_chs and
+            `num_experts` > 0 selects CondConv2d. `groups` must not be passed directly.
+    """
+    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
+    if isinstance(kernel_size, list):
+        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
+        # We're going to use only lists for defining the MixedConv2d kernel groups,
+        # ints, tuples, other iterables will continue to pass to normal conv and specify h, w.
+        m = MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
+    else:
+        depthwise = kwargs.pop('depthwise', False)
+        groups = out_chs if depthwise else 1
+        # Always take num_experts out of kwargs: the plain conv layers do not accept it, so a
+        # non-positive value must not be forwarded to them.
+        num_experts = kwargs.pop('num_experts', 0)
+        if num_experts is not None and num_experts > 0:
+            m = CondConv2d(in_chs, out_chs, kernel_size, groups=groups, num_experts=num_experts, **kwargs)
+        else:
+            m = create_conv2d_pad(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
+    return m
--- a/timm/models/layers/drop.py
+++ b/timm/models/layers/drop.py
@ -0,0 +1,109 @@
+""" DropBlock, DropPath
+
+PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers.
+
+Papers:
+DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890)
+
+Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382)
+
+Code:
+DropBlock impl inspired by two Tensorflow impl that I liked:
+ - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74
+ - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py
+
+Hacked together by Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import math
+
+
+def drop_block_2d(x, drop_prob=0.1, training=False, block_size=7, gamma_scale=1.0, drop_with_noise=False):
+    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
+
+    DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
+    runs with success, but needs further validation and possibly optimization for lower runtime impact.
+
+    Args:
+        x: 4D input; the last two dims are read as (height, width)
+        drop_prob: target drop probability; 0. disables the layer
+        training: the input is returned unchanged when False
+        block_size: side length of the dropped blocks, clipped to the feature map size
+        gamma_scale: multiplier applied to the seed drop rate
+        drop_with_noise: replace dropped positions with gaussian noise instead of zeroing and rescaling
+
+    Returns:
+        `x` itself when the layer is inactive, otherwise a new masked tensor.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    _, _, height, width = x.shape
+    total_size = width * height
+    clipped_block_size = min(block_size, min(width, height))
+    # seed_drop_rate, the gamma parameter
+    # NOTE(review): the denominator uses the unclipped block_size, so it is zero or negative
+    # when block_size exceeds width + 1 or height + 1 - confirm whether clipped_block_size was intended.
+    seed_drop_rate = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / (
+            (width - block_size + 1) *
+            (height - block_size + 1))
+
+    # Forces the block to be inside the feature map.
+    # NOTE(review): meshgrid is called with (width, height) so the grids look (width, height)-shaped
+    # before being reshaped to (1, 1, height, width); presumably only equivalent for square maps - confirm.
+    w_i, h_i = torch.meshgrid(torch.arange(width).to(x.device), torch.arange(height).to(x.device))
+    valid_block = ((w_i >= clipped_block_size // 2) & (w_i < width - (clipped_block_size - 1) // 2)) & \
+                  ((h_i >= clipped_block_size // 2) & (h_i < height - (clipped_block_size - 1) // 2))
+    valid_block = torch.reshape(valid_block, (1, 1, height, width)).float()
+
+    uniform_noise = torch.rand_like(x, dtype=torch.float32)
+    # block_mask is 1 where the activation is kept; a valid position becomes a drop seed (0)
+    # when its uniform sample falls below seed_drop_rate
+    block_mask = ((2 - seed_drop_rate - valid_block + uniform_noise) >= 1).float()
+    # negated max pool == min pool: grows every zero seed into a clipped_block_size square
+    block_mask = -F.max_pool2d(
+        -block_mask,
+        kernel_size=clipped_block_size,  # block_size, ???
+        stride=1,
+        padding=clipped_block_size // 2)
+
+    if drop_with_noise:
+        normal_noise = torch.randn_like(x)
+        x = x * block_mask + normal_noise * (1 - block_mask)
+    else:
+        # rescale by (total elements / kept elements); the epsilon guards an all-dropped mask
+        normalize_scale = block_mask.numel() / (torch.sum(block_mask) + 1e-7)
+        x = x * block_mask * normalize_scale
+    return x
+
+
+class DropBlock2d(nn.Module):
+    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
+
+    Module wrapper that stores the hyper-parameters and defers to `drop_block_2d`, passing
+    the module's own `training` flag.
+    """
+    def __init__(self,
+                 drop_prob=0.1,
+                 block_size=7,
+                 gamma_scale=1.0,
+                 with_noise=False):
+        super(DropBlock2d, self).__init__()
+        self.drop_prob = drop_prob
+        self.block_size = block_size
+        self.gamma_scale = gamma_scale
+        self.with_noise = with_noise
+
+    def forward(self, x):
+        return drop_block_2d(
+            x,
+            drop_prob=self.drop_prob,
+            training=self.training,
+            block_size=self.block_size,
+            gamma_scale=self.gamma_scale,
+            drop_with_noise=self.with_noise)
+
+
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    One random draw is made per element of dim 0; kept samples are scaled by 1 / (1 - drop_prob)
+    and dropped samples are zeroed. `x` is returned unchanged when not training or drop_prob is 0.
+    """
+    if not training or drop_prob == 0.:
+        return x
+    keep_prob = 1 - drop_prob
+    mask_shape = (x.size()[0], 1, 1, 1)
+    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0
+    keep_mask = (keep_prob + torch.rand(mask_shape, dtype=x.dtype, device=x.device)).floor_()
+    return x.div(keep_prob) * keep_mask
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).
+
+    Args:
+        drop_prob: probability of dropping a sample's path; None or 0 makes the layer an identity.
+    """
+    def __init__(self, drop_prob=None):
+        # plain nn.Module base: this layer holds no child modules, so nn.ModuleDict was not needed
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        # Identity when disabled. Handling it here keeps the default drop_prob=None from
+        # reaching the `1 - drop_prob` arithmetic in drop_path during training.
+        if not self.drop_prob or not self.training:
+            return x
+        return drop_path(x, self.drop_prob, self.training)
--- a/timm/models/layers/eca.py
+++ b/timm/models/layers/eca.py
@ -0,0 +1,124 @@
+"""
+ECA module from ECAnet
+
+paper: ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks
+https://arxiv.org/abs/1910.03151
+
+Original ECA model borrowed from https://github.com/BangguWu/ECANet
+
+Modified circular ECA implementation and adaption for use in timm package
+by Chris Ha https://github.com/VRandme
+
+Original License:
+
+MIT License
+
+Copyright (c) 2019 BangguWu, Qilong Wang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import math
+from torch import nn
+import torch.nn.functional as F
+
+
+class EcaModule(nn.Module):
+    """Constructs an ECA module.
+
+    The input is global-average-pooled, a 1D convolution is run across the channel axis of
+    the pooled descriptor, and the sigmoid of the result gates the input.
+
+    Args:
+        channels: Number of channels of the input feature map. When given, the kernel size is
+            derived from it with the adaptive mapping function of the original paper
+            (https://arxiv.org/pdf/1910.03151.pdf) and `kernel_size` is overridden.
+        kernel_size: Odd kernel size, used as-is when `channels` is None (default=3)
+        gamma, beta: Parameters of the channels -> kernel size mapping function.
+    """
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
+        super(EcaModule, self).__init__()
+        assert kernel_size % 2 == 1
+
+        if channels is not None:
+            # adaptive kernel size: bump an even mapped size to the next odd value, never below 3
+            mapped = int(abs(math.log(channels, 2) + beta) / gamma)
+            if mapped % 2 == 0:
+                mapped += 1
+            kernel_size = max(mapped, 3)
+
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
+
+    def forward(self, x):
+        batch = x.shape[0]
+        # global spatial descriptor, reshaped so the channels form the conv's length axis
+        y = self.avg_pool(x).view(batch, 1, -1)
+        y = self.conv(y)
+        # back to (batch, channels, 1, 1) gates
+        y = y.view(batch, -1, 1, 1).sigmoid()
+        return x * y.expand_as(x)
+
+
+class CecaModule(nn.Module):
+    """Constructs a circular ECA module.
+
+    ECA module where the conv uses circular padding rather than zero padding.
+    Unlike the spatial dimension, the channels do not have inherent ordering nor
+    locality. Although this module in essence applies such an assumption, it is unnecessary
+    to limit the channels on either "edge" from being circularly adapted to each other.
+    This will fundamentally increase connectivity and possibly increase performance metrics
+    (accuracy, robustness), without significantly impacting resource metrics
+    (parameter size, throughput, latency, etc)
+
+    Args:
+        channels: Number of channels of the input feature map. When given, the kernel size is
+            derived from it with the adaptive mapping function of the original paper
+            (https://arxiv.org/pdf/1910.03151.pdf) and `kernel_size` is overridden.
+        kernel_size: Odd kernel size, used as-is when `channels` is None (default=3)
+        gamma, beta: Parameters of the channels -> kernel size mapping function.
+    """
+
+    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
+        super(CecaModule, self).__init__()
+        assert kernel_size % 2 == 1
+
+        if channels is not None:
+            # adaptive kernel size: bump an even mapped size to the next odd value, never below 3
+            mapped = int(abs(math.log(channels, 2) + beta) / gamma)
+            if mapped % 2 == 0:
+                mapped += 1
+            kernel_size = max(mapped, 3)
+
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        # pytorch circular padding mode is buggy as of pytorch 1.4
+        # see https://github.com/pytorch/pytorch/pull/17240
+        # so the conv itself is unpadded and circular padding is applied manually in forward
+        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False)
+        self.padding = (kernel_size - 1) // 2
+
+    def forward(self, x):
+        batch = x.shape[0]
+        # global spatial descriptor, reshaped so the channels form the conv's length axis
+        y = self.avg_pool(x).view(batch, 1, -1)
+        # manual circular padding, F.pad does not seem to be bugged
+        y = F.pad(y, (self.padding, self.padding), mode='circular')
+        y = self.conv(y)
+        # back to (batch, channels, 1, 1) gates
+        y = y.view(batch, -1, 1, 1).sigmoid()
+        return x * y.expand_as(x)
--- a/timm/models/layers/helpers.py
+++ b/timm/models/layers/helpers.py
@ -0,0 +1,27 @@
+""" Layer/Module Helpers
+
+Hacked together by Ross Wightman
+"""
+from itertools import repeat
+from torch._six import container_abcs
+
+
+# From PyTorch internals
+def _ntuple(n):
+    """Build a parser that passes iterables through and repeats any other value `n` times as a tuple."""
+    def parse(x):
+        return x if isinstance(x, container_abcs.Iterable) else tuple(repeat(x, n))
+    return parse
+
+
+tup_single = _ntuple(1)
+tup_pair = _ntuple(2)
+tup_triple = _ntuple(3)
+tup_quadruple = _ntuple(4)
+
+
+
+
+
+
--- a/timm/models/layers/median_pool.py
+++ b/timm/models/layers/median_pool.py
--- a/timm/models/layers/mixed_conv2d.py
+++ b/timm/models/layers/mixed_conv2d.py
@ -0,0 +1,51 @@
+""" PyTorch Mixed Convolution
+
+Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595)
+
+Hacked together by Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+
+from .conv2d_same import create_conv2d_pad
+
+
+def _split_channels(num_chan, num_groups):
+    """Divide `num_chan` into `num_groups` integer sizes; the first group absorbs the remainder."""
+    sizes = [num_chan // num_groups for _ in range(num_groups)]
+    leftover = num_chan - sum(sizes)
+    sizes[0] += leftover
+    return sizes
+
+
+class MixedConv2d(nn.ModuleDict):
+    """ Mixed Grouped Convolution
+
+    Based on MDConv and GroupedConv in MixNet impl:
+      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
+
+    Input and output channels are divided across one convolution per entry of `kernel_size`.
+    Each convolution sees its own slice of the input and the results are concatenated on dim 1.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=3,
+                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
+        super(MixedConv2d, self).__init__()
+
+        kernel_sizes = kernel_size if isinstance(kernel_size, list) else [kernel_size]
+        group_count = len(kernel_sizes)
+        in_splits = _split_channels(in_channels, group_count)
+        out_splits = _split_channels(out_channels, group_count)
+        self.in_channels = sum(in_splits)
+        self.out_channels = sum(out_splits)
+        for idx, k in enumerate(kernel_sizes):
+            in_ch, out_ch = in_splits[idx], out_splits[idx]
+            branch = create_conv2d_pad(
+                in_ch, out_ch, k, stride=stride,
+                padding=padding, dilation=dilation, groups=out_ch if depthwise else 1, **kwargs)
+            # use add_module to keep key space clean
+            self.add_module(str(idx), branch)
+        self.splits = in_splits
+
+    def forward(self, x):
+        chunks = torch.split(x, self.splits, 1)
+        outputs = [conv(chunk) for conv, chunk in zip(self.values(), chunks)]
+        return torch.cat(outputs, 1)
--- a/timm/models/layers/padding.py
+++ b/timm/models/layers/padding.py
@ -0,0 +1,33 @@
+""" Padding Helpers
+
+Hacked together by Ross Wightman
+"""
+import math
+from typing import List
+
+import torch.nn.functional as F
+
+
+# Calculate symmetric padding for a convolution
+def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int:
+    """Return the symmetric padding for a convolution; extra keyword arguments are ignored."""
+    dilated_extent = dilation * (kernel_size - 1)
+    return ((stride - 1) + dilated_extent) // 2
+
+
+# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution
+def get_same_padding(x: int, k: int, s: int, d: int):
+    """Return the total TensorFlow-like 'SAME' padding along one dimension (never negative).
+
+    x is the input size, k the kernel size, s the stride and d the dilation.
+    """
+    out_size = math.ceil(x / s)
+    span_needed = (out_size - 1) * s + (k - 1) * d + 1
+    return max(span_needed - x, 0)
+
+
+# Can SAME padding for given args be done statically?
+def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_):
+    """Return True when stride is 1 and the dilated kernel extent is even; extra kwargs are ignored."""
+    if stride != 1:
+        return False
+    return (dilation * (kernel_size - 1)) % 2 == 0
+
+
+# Dynamically pad input x with 'SAME' padding for conv with specified args
+def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1)):
+    """Pad the last two dims of `x` with 'SAME' padding for kernel `k`, stride `s`, dilation `d`.
+
+    When the total padding is odd the extra element goes on the bottom / right. `x` is
+    returned unchanged when no padding is needed.
+    """
+    ih, iw = x.size()[-2:]
+    pad_h = get_same_padding(ih, k[0], s[0], d[0])
+    pad_w = get_same_padding(iw, k[1], s[1], d[1])
+    if not (pad_h > 0 or pad_w > 0):
+        return x
+    left, top = pad_w // 2, pad_h // 2
+    return F.pad(x, [left, pad_w - left, top, pad_h - top])
--- a/timm/models/layers/se.py
+++ b/timm/models/layers/se.py
@ -0,0 +1,21 @@
+from torch import nn as nn
+
+
+class SEModule(nn.Module):
+    """ Squeeze-and-excitation style channel gate.
+
+    Global average pool -> 1x1 conv reduce -> activation -> 1x1 conv expand -> sigmoid gate.
+    The reduced width is `channels // reduction`, but never fewer than 8 channels.
+    """
+
+    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
+        super(SEModule, self).__init__()
+        squeezed = max(channels // reduction, 8)
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc1 = nn.Conv2d(channels, squeezed, kernel_size=1, padding=0, bias=True)
+        self.act = act_layer(inplace=True)
+        self.fc2 = nn.Conv2d(squeezed, channels, kernel_size=1, padding=0, bias=True)
+
+    def forward(self, x):
+        gate = self.fc2(self.act(self.fc1(self.avg_pool(x))))
+        return x * gate.sigmoid()
--- a/timm/models/layers/selective_kernel.py
+++ b/timm/models/layers/selective_kernel.py
@ -0,0 +1,120 @@
+""" Selective Kernel Convolution/Attention
+
+Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
+
+Hacked together by Ross Wightman
+"""
+
+import torch
+from torch import nn as nn
+
+from .conv_bn_act import ConvBnAct
+
+
+def _kernel_valid(k):
+    """Assert that a kernel size, or every entry of a (nested) list/tuple of them, is odd and >= 3.
+
+    Raises:
+        AssertionError: if any kernel size is even or smaller than 3.
+    """
+    if isinstance(k, (list, tuple)):
+        # check every entry; returning from inside the loop only ever validated the first one
+        for ki in k:
+            _kernel_valid(ki)
+        return
+    assert k >= 3 and k % 2
+
+
+class SelectiveKernelAttn(nn.Module):
+    def __init__(self, channels, num_paths=2, attn_channels=32,
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+        """ Selective Kernel Attention Module
+
+        Selective Kernel attention mechanism factored out into its own module.
+
+        The forward input must have `num_paths` entries on dim 1. The output holds one
+        softmax-normalized (across paths) weight per path and channel.
+        """
+        super(SelectiveKernelAttn, self).__init__()
+        self.num_paths = num_paths
+        self.pool = nn.AdaptiveAvgPool2d(1)
+        self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False)
+        self.bn = norm_layer(attn_channels)
+        self.act = act_layer(inplace=True)
+        self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        assert x.shape[1] == self.num_paths
+        # fuse the paths by summation, then squeeze the spatial dims
+        fused = self.pool(x.sum(dim=1))
+        reduced = self.act(self.bn(self.fc_reduce(fused)))
+        logits = self.fc_select(reduced)
+        B, C, H, W = logits.shape
+        # split the channel axis back into (paths, channels) and normalize across paths
+        logits = logits.view(B, self.num_paths, C // self.num_paths, H, W)
+        return logits.softmax(dim=1)
+
+
+class SelectiveKernelConv(nn.Module):
+
+    def __init__(self, in_channels, out_channels, kernel_size=None, stride=1, dilation=1, groups=1,
+                 attn_reduction=16, min_attn_channels=32, keep_3x3=True, split_input=False,
+                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+        """ Selective Kernel Convolution Module
+
+        As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications.
+
+        Largest change is the input split, which divides the input channels across each convolution path, this can
+        be viewed as a grouping of sorts, but the output channel counts expand to the module level value. This keeps
+        the parameter count from ballooning when the convolutions themselves don't have groups, but still provides
+        a noteworthy increase in performance over similar param count models without this attention layer. -Ross W
+
+        Args:
+            in_channels (int):  module input (feature) channel count
+            out_channels (int):  module output (feature) channel count
+            kernel_size (int, list): kernel size for each convolution branch
+            stride (int): stride for convolutions
+            dilation (int): dilation for module as a whole, impacts dilation of each branch
+            groups (int): number of groups for each branch
+            attn_reduction (int, float): reduction factor for attention features
+            min_attn_channels (int): minimum attention feature channels
+            keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations
+            split_input (bool): split input channels evenly across each convolution branch, keeps param count lower,
+                can be viewed as grouping by path, output expands to module out_channels count
+            drop_block (nn.Module): drop block module
+            act_layer (nn.Module): activation layer to use
+            norm_layer (nn.Module): batchnorm/norm layer to use
+        """
+        super(SelectiveKernelConv, self).__init__()
+        kernel_size = kernel_size or [3, 5]  # default to one 3x3 and one 5x5 branch. 5x5 -> 3x3 + dilation
+        _kernel_valid(kernel_size)
+        # a single (non-list) kernel size is used for two identical branches
+        if not isinstance(kernel_size, list):
+            kernel_size = [kernel_size] * 2
+        if keep_3x3:
+            # emulate a k x k kernel with a 3x3 kernel dilated by (k - 1) // 2 (times the module dilation)
+            dilation = [dilation * (k - 1) // 2 for k in kernel_size]
+            kernel_size = [3] * len(kernel_size)
+        else:
+            dilation = [dilation] * len(kernel_size)
+        self.num_paths = len(kernel_size)
+        # self.in_channels keeps the module-level count; the local in_channels may be reduced below
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.split_input = split_input
+        if self.split_input:
+            assert in_channels % self.num_paths == 0
+            in_channels = in_channels // self.num_paths
+        groups = min(out_channels, groups)
+
+        conv_kwargs = dict(
+            stride=stride, groups=groups, drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer)
+        self.paths = nn.ModuleList([
+            ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs)
+            for k, d in zip(kernel_size, dilation)])
+
+        attn_channels = max(int(out_channels / attn_reduction), min_attn_channels)
+        # NOTE(review): act_layer / norm_layer are not forwarded here, so the attention module
+        # uses its own defaults - confirm this is intended.
+        self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels)
+        # the same drop_block instance was also handed to every ConvBnAct path above
+        self.drop_block = drop_block
+
+    def forward(self, x):
+        if self.split_input:
+            # each path convolves its own equally sized slice of the input channels
+            x_split = torch.split(x, self.in_channels // self.num_paths, 1)
+            x_paths = [op(x_split[i]) for i, op in enumerate(self.paths)]
+        else:
+            x_paths = [op(x) for op in self.paths]
+        # stack the path outputs on a new dim 1, weight them by the attention and sum them back
+        x = torch.stack(x_paths, dim=1)
+        x_attn = self.attn(x)
+        x = x * x_attn
+        x = torch.sum(x, dim=1)
+        return x
--- a/timm/models/layers/split_batchnorm.py
+++ b/timm/models/layers/split_batchnorm.py
--- a/timm/models/layers/test_time_pool.py
+++ b/timm/models/layers/test_time_pool.py
@ -1,3 +1,8 @@
+""" Test Time Pooling (Average-Max Pool)
+
+Hacked together by Ross Wightman
+"""
+
 import logging
 from torch import nn
 import torch.nn.functional as F
@ -29,6 +34,8 @@ class TestTimePoolHead(nn.Module):

 def apply_test_time_pool(model, config, args):
    test_time_pool = False
+    if not hasattr(model, 'default_cfg') or not model.default_cfg:
+        return model, False
    if not args.no_test_pool and \
            config['input_size'][-1] > model.default_cfg['input_size'][-1] and \
            config['input_size'][-2] > model.default_cfg['input_size'][-2]:
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@ -7,15 +7,12 @@ Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244

 Hacked together by Ross Wightman
 """
-import torch.nn as nn
-import torch.nn.functional as F

 from .efficientnet_builder import *
-from .activations import HardSwish, hard_sigmoid
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
-from .conv2d_layers import select_conv2d
+from .layers import SelectAdaptivePool2d, create_conv2d
+from .layers.activations import HardSwish, hard_sigmoid
 from .feature_hooks import FeatureHooks
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD

@ -74,7 +71,7 @@ class MobileNetV3(nn.Module):
    """

    def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
-                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(MobileNetV3, self).__init__()
        
@ -85,7 +82,7 @@ class MobileNetV3(nn.Module):

        # Stem
        stem_size = round_channels(stem_size, channel_multiplier)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -93,14 +90,14 @@ class MobileNetV3(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
        self._in_chs = builder.in_chs
        
        # Head + Pooling
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
+        self.conv_head = create_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
        self.act2 = act_layer(inplace=True)

        # Classifier
@ -151,7 +148,7 @@ class MobileNetV3Features(nn.Module):

    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='',
-                 act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0., se_kwargs=None,
+                 act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., se_kwargs=None,
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(MobileNetV3Features, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -165,7 +162,7 @@ class MobileNetV3Features(nn.Module):

        # Stem
        stem_size = round_channels(stem_size, channel_multiplier)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -173,7 +170,7 @@ class MobileNetV3Features(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
        self._in_chs = builder.in_chs
--- a/timm/models/nasnet.py
+++ b/timm/models/nasnet.py
@ -4,7 +4,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d


 __all__ = ['NASNetALarge']
--- a/timm/models/pnasnet.py
+++ b/timm/models/pnasnet.py
@ -14,7 +14,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d

 __all__ = ['PNASNet5Large']

--- a/timm/models/res2net.py
+++ b/timm/models/res2net.py
@ -8,10 +8,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from .resnet import ResNet, SEModule
+from .resnet import ResNet
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SEModule
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 __all__ = []
@ -53,15 +53,16 @@ class Bottle2neck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=26, scale=4, use_se=False,
-                 act_layer=nn.ReLU, norm_layer=None, dilation=1, previous_dilation=1, **_):
+                 cardinality=1, base_width=26, scale=4, dilation=1, first_dilation=None,
+                 act_layer=nn.ReLU, norm_layer=None, attn_layer=None, **_):
        super(Bottle2neck, self).__init__()
        self.scale = scale
        self.is_first = stride > 1 or downsample is not None
        self.num_scales = max(1, scale - 1)
        width = int(math.floor(planes * (base_width / 64.0))) * cardinality
-        outplanes = planes * self.expansion
        self.width = width
+        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation

        self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False)
        self.bn1 = norm_layer(width * scale)
@ -70,8 +71,8 @@ class Bottle2neck(nn.Module):
        bns = []
        for i in range(self.num_scales):
            convs.append(nn.Conv2d(
-                width, width, kernel_size=3, stride=stride, padding=dilation,
-                dilation=dilation, groups=cardinality, bias=False))
+                width, width, kernel_size=3, stride=stride, padding=first_dilation,
+                dilation=first_dilation, groups=cardinality, bias=False))
            bns.append(norm_layer(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
@ -81,11 +82,14 @@ class Bottle2neck(nn.Module):

        self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+        self.se = attn_layer(outplanes) if attn_layer is not None else None

        self.relu = act_layer(inplace=True)
        self.downsample = downsample

+    def zero_init_last_bn(self):
+        """Fill the weight of ``bn3``, the norm layer applied after ``conv3``, with zeros."""
+        nn.init.zeros_(self.bn3.weight)
+
    def forward(self, x):
        residual = x

--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@ -7,13 +7,12 @@ ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered ste
 """
 import math

-import torch
 import torch.nn as nn
 import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d, DropBlock2d, DropPath, AvgPool2dSame, create_attn
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD


@ -100,136 +99,182 @@ default_cfgs = {
    'seresnext26tn_32x4d': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26tn_32x4d-569cb627.pth',
        interpolation='bicubic'),
+    'ecaresnext26tn_32x4d': _cfg(
+        url='',
+        interpolation='bicubic'),
+    'ecaresnet18': _cfg(),
+    'ecaresnet50': _cfg(),
 }


-def _get_padding(kernel_size, stride, dilation=1):
+def get_padding(kernel_size, stride, dilation=1):
    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
    return padding


-class SEModule(nn.Module):
-
-    def __init__(self, channels, reduction_channels):
-        super(SEModule, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc1 = nn.Conv2d(
-            channels, reduction_channels, kernel_size=1, padding=0, bias=True)
-        self.relu = nn.ReLU(inplace=True)
-        self.fc2 = nn.Conv2d(
-            reduction_channels, channels, kernel_size=1, padding=0, bias=True)
-
-    def forward(self, x):
-        x_se = self.avg_pool(x)
-        x_se = self.fc1(x_se)
-        x_se = self.relu(x_se)
-        x_se = self.fc2(x_se)
-        return x * x_se.sigmoid()
-
-
 class BasicBlock(nn.Module):
-    __constants__ = ['se', 'downsample']  # for pre 1.4 torchscript compat
    expansion = 1

-    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=64, use_se=False,
-                 reduce_first=1, dilation=1, previous_dilation=1, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+                 reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+                 attn_layer=None, drop_block=None, drop_path=None):
        super(BasicBlock, self).__init__()

        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
        assert base_width == 64, 'BasicBlock doest not support changing base width'
        first_planes = planes // reduce_first
        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation

        self.conv1 = nn.Conv2d(
-            inplanes, first_planes, kernel_size=3, stride=stride, padding=dilation,
-            dilation=dilation, bias=False)
+            inplanes, first_planes, kernel_size=3, stride=stride, padding=first_dilation,
+            dilation=first_dilation, bias=False)
        self.bn1 = norm_layer(first_planes)
        self.act1 = act_layer(inplace=True)
        self.conv2 = nn.Conv2d(
-            first_planes, outplanes, kernel_size=3, padding=previous_dilation,
-            dilation=previous_dilation, bias=False)
+            first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False)
        self.bn2 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+
+        self.se = create_attn(attn_layer, outplanes)
+
        self.act2 = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
+        self.drop_block = drop_block
+        self.drop_path = drop_path
+
+    def zero_init_last_bn(self):
+        """Fill the weight of ``bn2``, the norm layer applied after ``conv2``, with zeros."""
+        nn.init.zeros_(self.bn2.weight)

    def forward(self, x):
        residual = x

-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.act1(out)
-        out = self.conv2(out)
-        out = self.bn2(out)
+        x = self.conv1(x)
+        x = self.bn1(x)
+        if self.drop_block is not None:
+            x = self.drop_block(x)
+        x = self.act1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        if self.drop_block is not None:
+            x = self.drop_block(x)

        if self.se is not None:
-            out = self.se(out)
+            x = self.se(x)

-        if self.downsample is not None:
-            residual = self.downsample(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)

-        out += residual
-        out = self.act2(out)
+        if self.downsample is not None:
+            residual = self.downsample(residual)
+        x += residual
+        x = self.act2(x)

-        return out
+        return x


 class Bottleneck(nn.Module):
    __constants__ = ['se', 'downsample']  # for pre 1.4 torchscript compat
    expansion = 4

-    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=64, use_se=False,
-                 reduce_first=1, dilation=1, previous_dilation=1, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+                 reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
+                 attn_layer=None, drop_block=None, drop_path=None):
        super(Bottleneck, self).__init__()

        width = int(math.floor(planes * (base_width / 64)) * cardinality)
        first_planes = width // reduce_first
        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation

        self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(first_planes)
        self.act1 = act_layer(inplace=True)
        self.conv2 = nn.Conv2d(
            first_planes, width, kernel_size=3, stride=stride,
-            padding=dilation, dilation=dilation, groups=cardinality, bias=False)
+            padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False)
        self.bn2 = norm_layer(width)
        self.act2 = act_layer(inplace=True)
        self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+
+        self.se = create_attn(attn_layer, outplanes)
+
        self.act3 = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
+        self.drop_block = drop_block
+        self.drop_path = drop_path
+
+    def zero_init_last_bn(self):
+        """Fill the weight of ``bn3``, the norm layer applied after ``conv3``, with zeros."""
+        nn.init.zeros_(self.bn3.weight)

    def forward(self, x):
        residual = x

-        out = self.conv1(x)
-        out = self.bn1(out)
-        out = self.act1(out)
+        x = self.conv1(x)
+        x = self.bn1(x)
+        if self.drop_block is not None:
+            x = self.drop_block(x)
+        x = self.act1(x)

-        out = self.conv2(out)
-        out = self.bn2(out)
-        out = self.act2(out)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        if self.drop_block is not None:
+            x = self.drop_block(x)
+        x = self.act2(x)

-        out = self.conv3(out)
-        out = self.bn3(out)
+        x = self.conv3(x)
+        x = self.bn3(x)
+        if self.drop_block is not None:
+            x = self.drop_block(x)

        if self.se is not None:
-            out = self.se(out)
+            x = self.se(x)
+
+        if self.drop_path is not None:
+            x = self.drop_path(x)

        if self.downsample is not None:
-            residual = self.downsample(x)
+            residual = self.downsample(residual)
+        x += residual
+        x = self.act3(x)

-        out += residual
-        out = self.act3(out)
+        return x

-        return out
+
+def downsample_conv(
+        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
+    """Build a conv + norm downsample path.
+
+    Args:
+        in_channels (int): channels of the incoming tensor
+        out_channels (int): channels produced by the conv and normalised afterwards
+        kernel_size (int): requested conv kernel size, replaced by 1 when both
+            ``stride`` and ``dilation`` are 1
+        stride (int): conv stride
+        dilation (int): fallback for ``first_dilation`` when that is not given
+        first_dilation (int): dilation applied to the conv, forced to 1 for a 1x1 kernel
+        norm_layer: norm layer constructor, ``nn.BatchNorm2d`` when not given
+
+    Returns:
+        nn.Sequential holding a bias-free ``nn.Conv2d`` followed by the norm layer.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+    if stride == 1 and dilation == 1:
+        # Nothing to downsample or dilate, a 1x1 conv is used.
+        kernel_size = 1
+    if kernel_size > 1:
+        first_dilation = first_dilation or dilation
+    else:
+        first_dilation = 1
+    padding = get_padding(kernel_size, stride, first_dilation)
+
+    conv = nn.Conv2d(
+        in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=first_dilation,
+        bias=False)
+    return nn.Sequential(conv, norm_layer(out_channels))
+
+
+def downsample_avg(
+        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
+    """Build a pool + 1x1 conv + norm downsample path.
+
+    Args:
+        in_channels (int): channels of the incoming tensor
+        out_channels (int): channels produced by the 1x1 conv and normalised afterwards
+        kernel_size (int): accepted but not used by this function
+        stride (int): stride of the average pool when ``dilation`` is 1
+        dilation (int): any value other than 1 switches the pool to stride 1
+        first_dilation (int): accepted but not used by this function
+        norm_layer: norm layer constructor, ``nn.BatchNorm2d`` when not given
+
+    Returns:
+        nn.Sequential holding the pooling layer (``nn.Identity`` when both ``stride``
+        and ``dilation`` are 1), a bias-free 1x1 ``nn.Conv2d`` and the norm layer.
+    """
+    if not norm_layer:
+        norm_layer = nn.BatchNorm2d
+
+    if stride == 1 and dilation == 1:
+        pool = nn.Identity()
+    else:
+        avg_stride = stride if dilation == 1 else 1
+        if avg_stride == 1 and dilation > 1:
+            pool_cls = AvgPool2dSame
+        else:
+            pool_cls = nn.AvgPool2d
+        pool = pool_cls(2, avg_stride, ceil_mode=True, count_include_pad=False)
+
+    projection = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False)
+    return nn.Sequential(pool, projection, norm_layer(out_channels))


 class ResNet(nn.Module):
@ -273,8 +318,6 @@ class ResNet(nn.Module):
        Number of classification classes.
    in_chans : int, default 3
        Number of input (color) channels.
-    use_se : bool, default False
-        Enable Squeeze-Excitation module in blocks
    cardinality : int, default 1
        Number of convolution groups for 3x3 conv in Bottleneck.
    base_width : int, default 64
@ -303,11 +346,11 @@ class ResNet(nn.Module):
    global_pool : str, default 'avg'
        Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
    """
-    def __init__(self, block, layers, num_classes=1000, in_chans=3, use_se=False,
+    def __init__(self, block, layers, num_classes=1000, in_chans=3,
                 cardinality=1, base_width=64, stem_width=64, stem_type='',
                 block_reduce_first=1, down_kernel_size=1, avg_down=False, output_stride=32,
-                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0.0, global_pool='avg',
-                 zero_init_last_bn=True, block_args=None):
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0.0, drop_path_rate=0.,
+                 drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None):
        block_args = block_args or dict()
        self.num_classes = num_classes
        deep_stem = 'deep' in stem_type
@ -339,6 +382,9 @@ class ResNet(nn.Module):
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Feature Blocks
+        dp = DropPath(drop_path_rate) if drop_path_rate else None
+        db_3 = DropBlock2d(drop_block_rate, 7, 0.25) if drop_block_rate else None
+        db_4 = DropBlock2d(drop_block_rate, 7, 1.00) if drop_block_rate else None
        channels, strides, dilations = [64, 128, 256, 512], [1, 2, 2, 2], [1] * 4
        if output_stride == 16:
            strides[3] = 1
@ -348,61 +394,47 @@ class ResNet(nn.Module):
            dilations[2:4] = [2, 4]
        else:
            assert output_stride == 32
-        llargs = list(zip(channels, layers, strides, dilations))
-        lkwargs = dict(
-            use_se=use_se, reduce_first=block_reduce_first, act_layer=act_layer, norm_layer=norm_layer,
-            avg_down=avg_down, down_kernel_size=down_kernel_size, **block_args)
-        self.layer1 = self._make_layer(block, *llargs[0], **lkwargs)
-        self.layer2 = self._make_layer(block, *llargs[1], **lkwargs)
-        self.layer3 = self._make_layer(block, *llargs[2], **lkwargs)
-        self.layer4 = self._make_layer(block, *llargs[3], **lkwargs)
+        layer_args = list(zip(channels, layers, strides, dilations))
+        layer_kwargs = dict(
+            reduce_first=block_reduce_first, act_layer=act_layer, norm_layer=norm_layer,
+            avg_down=avg_down, down_kernel_size=down_kernel_size, drop_path=dp, **block_args)
+        self.layer1 = self._make_layer(block, *layer_args[0], **layer_kwargs)
+        self.layer2 = self._make_layer(block, *layer_args[1], **layer_kwargs)
+        self.layer3 = self._make_layer(block, drop_block=db_3, *layer_args[2], **layer_kwargs)
+        self.layer4 = self._make_layer(block, drop_block=db_4, *layer_args[3], **layer_kwargs)

        # Head (Pooling and Classifier)
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
        self.num_features = 512 * block.expansion
        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)

-        last_bn_name = 'bn3' if 'Bottle' in block.__name__ else 'bn2'
        for n, m in self.named_modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
-                if zero_init_last_bn and 'layer' in n and last_bn_name in n:
-                    # Initialize weight/gamma of last BN in each residual block to zero
-                    nn.init.constant_(m.weight, 0.)
-                else:
-                    nn.init.constant_(m.weight, 1.)
+                nn.init.constant_(m.weight, 1.)
                nn.init.constant_(m.bias, 0.)
+        if zero_init_last_bn:
+            for m in self.modules():
+                if hasattr(m, 'zero_init_last_bn'):
+                    m.zero_init_last_bn()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, reduce_first=1,
-                    use_se=False, avg_down=False, down_kernel_size=1, **kwargs):
-        norm_layer = kwargs.get('norm_layer')
+                    avg_down=False, down_kernel_size=1, **kwargs):
        downsample = None
-        down_kernel_size = 1 if stride == 1 and dilation == 1 else down_kernel_size
+        first_dilation = 1 if dilation in (1, 2) else 2
        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample_padding = _get_padding(down_kernel_size, stride)
-            downsample_layers = []
-            conv_stride = stride
-            if avg_down:
-                avg_stride = stride if dilation == 1 else 1
-                conv_stride = 1
-                downsample_layers = [nn.AvgPool2d(avg_stride, avg_stride, ceil_mode=True, count_include_pad=False)]
-            downsample_layers += [
-                nn.Conv2d(self.inplanes, planes * block.expansion, down_kernel_size,
-                          stride=conv_stride, padding=downsample_padding, bias=False),
-                norm_layer(planes * block.expansion)]
-            downsample = nn.Sequential(*downsample_layers)
+            downsample_args = dict(
+                in_channels=self.inplanes, out_channels=planes * block.expansion, kernel_size=down_kernel_size,
+                stride=stride, dilation=dilation, first_dilation=first_dilation, norm_layer=kwargs.get('norm_layer'))
+            downsample = downsample_avg(**downsample_args) if avg_down else downsample_conv(**downsample_args)

-        first_dilation = 1 if dilation in (1, 2) else 2
-        bkwargs = dict(
+        block_kwargs = dict(
            cardinality=self.cardinality, base_width=self.base_width, reduce_first=reduce_first,
-            use_se=use_se, **kwargs)
-        layers = [block(
-            self.inplanes, planes, stride, downsample, dilation=first_dilation, previous_dilation=dilation, **bkwargs)]
+            dilation=dilation, **kwargs)
+        layers = [block(self.inplanes, planes, stride, downsample, first_dilation=first_dilation, **block_kwargs)]
        self.inplanes = planes * block.expansion
-        for i in range(1, blocks):
-            layers.append(block(
-                self.inplanes, planes, dilation=dilation, previous_dilation=dilation, **bkwargs))
+        layers += [block(self.inplanes, planes, **block_kwargs) for _ in range(1, blocks)]

        return nn.Sequential(*layers)

@ -430,8 +462,8 @@ class ResNet(nn.Module):
    def forward(self, x):
        x = self.forward_features(x)
        x = self.global_pool(x).flatten(1)
-        if self.drop_rate > 0.:
-            x = F.dropout(x, p=self.drop_rate, training=self.training)
+        if self.drop_rate:
+            x = F.dropout(x, p=float(self.drop_rate), training=self.training)
        x = self.fc(x)
        return x

@ -903,9 +935,8 @@ def seresnext26d_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
    """
    default_cfg = default_cfgs['seresnext26d_32x4d']
    model = ResNet(
-        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
-        stem_width=32, stem_type='deep', avg_down=True, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, stem_type='deep', avg_down=True,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -921,8 +952,8 @@ def seresnext26t_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
    default_cfg = default_cfgs['seresnext26t_32x4d']
    model = ResNet(
        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
-        stem_width=32, stem_type='deep_tiered', avg_down=True, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        stem_width=32, stem_type='deep_tiered', avg_down=True,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -938,8 +969,55 @@ def seresnext26tn_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
    default_cfg = default_cfgs['seresnext26tn_32x4d']
    model = ResNet(
        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
-        stem_width=32, stem_type='deep_tiered_narrow', avg_down=True, use_se=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        stem_width=32, stem_type='deep_tiered_narrow', avg_down=True,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
+    model.default_cfg = default_cfg
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes, in_chans)
+    return model
+
+
+@register_model
+def ecaresnext26tn_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Constructs an ECA-ResNeXt-26-TN model.
+
+    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64
+    channels in the deep stem. The channel number of the middle stem conv is narrower than in the 'T'
+    variant. The SE module is replaced with the ECA module (``attn_layer='eca'``).
+    """
+    cfg = default_cfgs['ecaresnext26tn_32x4d']
+    model = ResNet(
+        Bottleneck,
+        [2, 2, 2, 2],
+        cardinality=32,
+        base_width=4,
+        stem_width=32,
+        stem_type='deep_tiered_narrow',
+        avg_down=True,
+        num_classes=num_classes,
+        in_chans=in_chans,
+        block_args={'attn_layer': 'eca'},
+        **kwargs)
+    model.default_cfg = cfg
+    if pretrained:
+        load_pretrained(model, cfg, num_classes, in_chans)
+    return model
+
+
+@register_model
+def ecaresnet18(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Constructs an ECA-ResNet-18 model: BasicBlock layers [2, 2, 2, 2] with ``attn_layer='eca'``.
+    """
+    cfg = default_cfgs['ecaresnet18']
+    model = ResNet(
+        BasicBlock,
+        [2, 2, 2, 2],
+        num_classes=num_classes,
+        in_chans=in_chans,
+        block_args={'attn_layer': 'eca'},
+        **kwargs)
+    model.default_cfg = cfg
+    if pretrained:
+        load_pretrained(model, cfg, num_classes, in_chans)
+    return model
+
+
+@register_model
+def ecaresnet50(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Constructs an ECA-ResNet-50 model.
+    """
+    default_cfg = default_cfgs['ecaresnet50']
+    block_args = dict(attn_layer='eca')
+    model = ResNet(
+        Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
--- a/timm/models/selecsls.py
+++ b/timm/models/selecsls.py
@ -17,7 +17,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 __all__ = ['SelecSLS']  # model_registry will add each entrypoint fn to this
--- a/timm/models/senet.py
+++ b/timm/models/senet.py
@ -16,7 +16,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

 __all__ = ['SENet']
--- a/timm/models/sknet.py
+++ b/timm/models/sknet.py
@ -0,0 +1,237 @@
+""" Selective Kernel Networks (ResNet base)
+
+Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
+
+This was inspired by reading 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268)
+and a streamlined impl at https://github.com/clovaai/assembled-cnn but I ended up building something closer
+to the original paper with some modifications of my own to better balance param count vs accuracy.
+
+Hacked together by Ross Wightman
+"""
+import math
+
+from torch import nn as nn
+
+from .registry import register_model
+from .helpers import load_pretrained
+from .layers import SelectiveKernelConv, ConvBnAct, create_attn
+from .resnet import ResNet
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+
+
+def _cfg(url='', **kwargs):
+    """Assemble a default-config dict for one model entry.
+
+    The base values are listed below; `kwargs` are merged last, so a keyword with the
+    same name as a base key replaces that base value.
+    """
+    cfg = dict(
+        url=url,
+        num_classes=1000, input_size=(3, 224, 224), pool_size=(7, 7),
+        crop_pct=0.875, interpolation='bicubic',
+        mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD,
+        first_conv='conv1', classifier='fc',
+    )
+    cfg.update(kwargs)
+    return cfg
+
+
+# Default configs keyed by model entrypoint name; each entrypoint below looks up its own key.
+# Entries created with a bare `_cfg()` keep the empty default `url`, i.e. no weights
+# location is configured for them in this table.
+default_cfgs = {
+    'skresnet18': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet18_ra-4eec2804.pth'),
+    'skresnet34': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet34_ra-bdc0ccde.pth'),
+    'skresnet50': _cfg(),
+    'skresnet50d': _cfg(),
+    'skresnext50_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnext50_ra-f40e40bf.pth'),
+}
+
+
+class SelectiveKernelBasic(nn.Module):
+    """Two-conv residual block whose first conv is a SelectiveKernelConv.
+
+    Main branch: SelectiveKernelConv -> ConvBnAct (kernel 3, no activation) -> optional
+    attention -> optional drop path. The residual (optionally downsampled) is then added
+    in place and the final activation is applied.
+
+    Args:
+        inplanes: input channel count
+        planes: base channel count; output channels are `planes * expansion`
+        stride: stride handed to the selective kernel conv
+        downsample: optional module applied to the residual before the add
+        cardinality: must be 1 for this block
+        base_width: must be 64 for this block
+        sk_kwargs: extra keyword args forwarded to SelectiveKernelConv (None -> {})
+        reduce_first: divisor applied to `planes` to get the first conv's output channels
+        dilation: dilation of the second conv
+        first_dilation: dilation of the first conv; falls back to `dilation` when falsy
+        drop_block: forwarded to both conv layers and also stored on the block
+        drop_path: optional module applied to the main branch before the residual add
+        act_layer: activation class; instantiated with `inplace=True` for the final act
+        norm_layer: norm layer class forwarded to both conv layers
+        attn_layer: attention spec handed to `create_attn` for the `.se` slot
+    """
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
+                 sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None,
+                 drop_block=None, drop_path=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None):
+        super(SelectiveKernelBasic, self).__init__()
+
+        sk_kwargs = sk_kwargs or {}
+        conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer)
+        # Messages name this class (they previously referred to 'BasicBlock' and had a typo).
+        assert cardinality == 1, 'SelectiveKernelBasic only supports cardinality of 1'
+        assert base_width == 64, 'SelectiveKernelBasic does not support changing base width'
+        first_planes = planes // reduce_first
+        outplanes = planes * self.expansion
+        first_dilation = first_dilation or dilation
+
+        self.conv1 = SelectiveKernelConv(
+            inplanes, first_planes, stride=stride, dilation=first_dilation, **conv_kwargs, **sk_kwargs)
+        # The second conv gets no activation of its own; `self.act` runs after the residual add.
+        conv_kwargs['act_layer'] = None
+        self.conv2 = ConvBnAct(
+            first_planes, outplanes, kernel_size=3, dilation=dilation, **conv_kwargs)
+        self.se = create_attn(attn_layer, outplanes)
+        self.act = act_layer(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dilation = dilation
+        self.drop_block = drop_block
+        self.drop_path = drop_path
+
+    def zero_init_last_bn(self):
+        """Zero the weight of the norm layer inside the last conv of the main branch."""
+        nn.init.zeros_(self.conv2.bn.weight)
+
+    def forward(self, x):
+        """Run the main branch, add the (optionally downsampled) input, then activate."""
+        residual = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        if self.se is not None:
+            x = self.se(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
+        if self.downsample is not None:
+            residual = self.downsample(residual)
+        x += residual
+        x = self.act(x)
+        return x
+
+
+class SelectiveKernelBottleneck(nn.Module):
+    """Three-conv bottleneck residual block with a SelectiveKernelConv in the middle.
+
+    Main branch: ConvBnAct (kernel 1) -> SelectiveKernelConv (strided, grouped by
+    `cardinality`) -> ConvBnAct (kernel 1, no activation) -> optional attention ->
+    optional drop path. The residual (optionally downsampled) is added in place and the
+    final activation follows. Output channels are `planes * expansion`.
+    """
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None,
+                 cardinality=1, base_width=64, sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None,
+                 drop_block=None, drop_path=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None):
+        super().__init__()
+
+        if not sk_kwargs:
+            sk_kwargs = {}
+        # Args shared by all three convs; only the activation differs between them.
+        shared = {'drop_block': drop_block, 'norm_layer': norm_layer}
+        per_group = math.floor(planes * (base_width / 64))
+        width = int(per_group * cardinality)
+        first_planes = width // reduce_first
+        outplanes = planes * self.expansion
+        if not first_dilation:
+            first_dilation = dilation
+
+        # Submodules are assigned in the same order as before: conv1, conv2, conv3, se, act.
+        self.conv1 = ConvBnAct(inplanes, first_planes, kernel_size=1, act_layer=act_layer, **shared)
+        self.conv2 = SelectiveKernelConv(
+            first_planes, width, stride=stride, dilation=first_dilation, groups=cardinality,
+            act_layer=act_layer, **shared, **sk_kwargs)
+        # Last conv has no activation of its own; `self.act` runs after the residual add.
+        self.conv3 = ConvBnAct(width, outplanes, kernel_size=1, act_layer=None, **shared)
+        self.se = create_attn(attn_layer, outplanes)
+        self.act = act_layer(inplace=True)
+        self.downsample = downsample
+        self.stride, self.dilation = stride, dilation
+        self.drop_block = drop_block
+        self.drop_path = drop_path
+
+    def zero_init_last_bn(self):
+        """Zero the weight of the norm layer inside the final 1x1 conv."""
+        nn.init.zeros_(self.conv3.bn.weight)
+
+    def forward(self, x):
+        """Run the three convs, add the (optionally downsampled) input, then activate."""
+        out = self.conv3(self.conv2(self.conv1(x)))
+        if self.se is not None:
+            out = self.se(out)
+        if self.drop_path is not None:
+            out = self.drop_path(out)
+        shortcut = x if self.downsample is None else self.downsample(x)
+        out += shortcut
+        return self.act(out)
+
+
+@register_model
+def skresnet18(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Build the Selective Kernel ResNet-18 entrypoint.
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variant splits the input channels fed to the selective
+    convolutions (`split_input=True`) to keep the parameter count down.
+    """
+    sk_kwargs = {'min_attn_channels': 16, 'attn_reduction': 8, 'split_input': True}
+    cfg = default_cfgs['skresnet18']
+    net = ResNet(
+        SelectiveKernelBasic, [2, 2, 2, 2], num_classes=num_classes, in_chans=in_chans,
+        block_args={'sk_kwargs': sk_kwargs}, zero_init_last_bn=False, **kwargs)
+    net.default_cfg = cfg
+    if not pretrained:
+        return net
+    load_pretrained(net, cfg, num_classes, in_chans)
+    return net
+
+
+@register_model
+def skresnet34(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Build the Selective Kernel ResNet-34 entrypoint.
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variant splits the input channels fed to the selective
+    convolutions (`split_input=True`) to keep the parameter count down.
+    """
+    sk_kwargs = {'min_attn_channels': 16, 'attn_reduction': 8, 'split_input': True}
+    cfg = default_cfgs['skresnet34']
+    net = ResNet(
+        SelectiveKernelBasic, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+        block_args={'sk_kwargs': sk_kwargs}, zero_init_last_bn=False, **kwargs)
+    net.default_cfg = cfg
+    if not pretrained:
+        return net
+    load_pretrained(net, cfg, num_classes, in_chans)
+    return net
+
+
+@register_model
+def skresnet50(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Build the Selective Kernel ResNet-50 entrypoint.
+
+    Unlike the configs in the Selective Kernel paper or "Compounding the Performance
+    Improvements...", this variant splits the input channels fed to the selective
+    convolutions (`split_input=True`) to keep the parameter count down.
+    """
+    cfg = default_cfgs['skresnet50']
+    block_args = {'sk_kwargs': {'split_input': True}}
+    net = ResNet(
+        SelectiveKernelBottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
+        block_args=block_args, zero_init_last_bn=False, **kwargs)
+    net.default_cfg = cfg
+    if not pretrained:
+        return net
+    load_pretrained(net, cfg, num_classes, in_chans)
+    return net
+
+
+@register_model
+def skresnet50d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Build the Selective Kernel ResNet-50-D entrypoint.
+
+    Same block setup as `skresnet50` (input channels split for the selective convs to keep
+    the parameter count down), with `stem_width=32`, `stem_type='deep'` and `avg_down=True`
+    passed to the ResNet constructor.
+    """
+    cfg = default_cfgs['skresnet50d']
+    block_args = {'sk_kwargs': {'split_input': True}}
+    net = ResNet(
+        SelectiveKernelBottleneck, [3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True,
+        num_classes=num_classes, in_chans=in_chans, block_args=block_args,
+        zero_init_last_bn=False, **kwargs)
+    net.default_cfg = cfg
+    if not pretrained:
+        return net
+    load_pretrained(net, cfg, num_classes, in_chans)
+    return net
+
+
+@register_model
+def skresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
+    """Build the Selective Kernel ResNeXt50-32x4d entrypoint.
+
+    Uses SelectiveKernelBottleneck with `cardinality=32`, `base_width=4` and no extra
+    selective-kernel overrides. This should be equivalent to the SKNet-50 model in the
+    Selective Kernel paper.
+    """
+    cfg = default_cfgs['skresnext50_32x4d']
+    net = ResNet(
+        SelectiveKernelBottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
+        num_classes=num_classes, in_chans=in_chans, zero_init_last_bn=False, **kwargs)
+    net.default_cfg = cfg
+    if not pretrained:
+        return net
+    load_pretrained(net, cfg, num_classes, in_chans)
+    return net
--- a/timm/models/xception.py
+++ b/timm/models/xception.py
@ -29,7 +29,7 @@ import torch.nn.functional as F

 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d

 __all__ = ['Xception']

--- a/train.py
+++ b/train.py
@ -81,10 +81,14 @@ parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
                    help='input batch size for training (default: 32)')
 parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
                    help='ratio of validation batch size to training batch size (default: 1)')
-parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
                    help='Dropout rate (default: 0.)')
-parser.add_argument('--drop-connect', type=float, default=0.0, metavar='DROP',
-                    help='Drop connect rate (default: 0.)')
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
+                    help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
+parser.add_argument('--drop-path', type=float, default=None, metavar='PCT',
+                    help='Drop path rate (default: None)')
+parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
+                    help='Drop block rate (default: None)')
 parser.add_argument('--jsd', action='store_true', default=False,
                    help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
 # Optimizer parameters
@ -242,7 +246,9 @@ def main():
        pretrained=args.pretrained,
        num_classes=args.num_classes,
        drop_rate=args.drop,
-        drop_connect_rate=args.drop_connect,
+        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
+        drop_path_rate=args.drop_path,
+        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
--- a/validate.py
+++ b/validate.py
@ -211,11 +211,24 @@ def main():
        logging.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names)))
        results = []
        try:
+            start_batch_size = args.batch_size
            for m, c in model_cfgs:
+                batch_size = start_batch_size
                args.model = m
                args.checkpoint = c
                result = OrderedDict(model=args.model)
-                r = validate(args)
+                r = {}
+                while not r and batch_size >= args.num_gpu:
+                    try:
+                        args.batch_size = batch_size
+                        print('Validating with batch size: %d' % args.batch_size)
+                        r = validate(args)
+                    except RuntimeError as e:
+                        if batch_size <= args.num_gpu:
+                            print("Validation failed with no ability to reduce batch size. Exiting.")
+                            raise e
+                        batch_size = max(batch_size // 2, args.num_gpu)
+                        print("Validation failed, reducing batch size by 50%")
                result.update(r)
                if args.checkpoint:
                    result['checkpoint'] = args.checkpoint