Merge pull request #88 from rwightman/attention

A lot of attention and much more
5 years ago · e0685dd415
parent f098fda2ca f1860ef3a5
commit e0685dd415
53 changed files with 1660 additions and 551 deletions
--- a/.gitignore
+++ b/.gitignore
@ -104,3 +104,5 @@ venv.bak/
 *.tar
 *.pth
 *.gz
 Untitled.ipynb
 Testing notebook.ipynb
--- a/README.md
+++ b/README.md
@ -2,6 +2,20 @@
 ## What's New
 ### Feb 18, 2020
 * Big refactor of model layers and addition of several attention mechanisms. Several additions motivated by 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268):
  * Move layer/module impl into `layers` subfolder/module of `models` and organize in a more granular fashion
  * ResNet downsample paths now properly support dilation (output stride != 32) for avg_pool ('D' variant) and 3x3 (SENets) networks
  * Add Selective Kernel Nets on top of ResNet base, pretrained weights
    * skresnet18 - 73% top-1
    * skresnet34 - 76.9% top-1 
    * skresnext50_32x4d (equiv to SKNet50) - 80.2% top-1
  * ECA and CECA (circular padding) attention layer contributed by [Chris Ha](https://github.com/VRandme)
  * CBAM attention experiment (not the best results so far, may remove)
  * Attention factory to allow dynamically selecting one of SE, ECA, CBAM in the `.se` position for all ResNets
  * Add DropBlock and DropPath (formerly DropConnect for EfficientNet/MobileNetv3) support to all ResNet variants
 * Full dataset results updated to incl NoisyStudent weights and 2 of the 3 SK weights
 ### Feb 12, 2020
 * Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from [Tensorflow TPU](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet)
@ -86,6 +100,7 @@ Included models:
    * 'Bag of Tricks' / Gluon C, D, E, S variations (https://arxiv.org/abs/1812.01187)
    * Instagram trained / ImageNet tuned ResNeXt101-32x8d to 32x48d from [facebookresearch](https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/)
    * Res2Net (https://github.com/gasvn/Res2Net, https://arxiv.org/abs/1904.01169)
    * Selective Kernel (SK) Nets (https://arxiv.org/abs/1903.06586)
 * DLA
    * Original (https://github.com/ucbdrive/dla, https://arxiv.org/abs/1707.06484)
    * Res2Net (https://github.com/gasvn/Res2Net, https://arxiv.org/abs/1904.01169)
@ -138,6 +153,8 @@ Several (less common) features that I often utilize in my projects are included.
 * AutoAugment (https://arxiv.org/abs/1805.09501) and RandAugment (https://arxiv.org/abs/1909.13719) ImageNet configurations modeled after impl for EfficientNet training (https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py)
 * AugMix w/ JSD loss (https://arxiv.org/abs/1912.02781), JSD w/ clean + augmented mixing support works with AutoAugment and RandAugment as well
 * SplitBatchNorm - allows splitting batch norm layers between clean and augmented (auxiliary batch norm) data
 * DropBlock (https://arxiv.org/abs/1810.12890)
 * Efficient Channel Attention - ECA (https://arxiv.org/abs/1910.03151)
 ## Results
@ -150,9 +167,11 @@ I've leveraged the training scripts in this repository to train a few of the mod
 |---|---|---|---|---|---|
 | efficientnet_b3a | 81.874 (18.126) | 95.840 (4.160) | 12.23M | bicubic | 320 (1.0 crop) |
 | efficientnet_b3 | 81.498 (18.502) | 95.718 (4.282) | 12.23M | bicubic | 300 |
 | skresnext50d_32x4d | 81.278 (18.722) | 95.366 (4.634) | 27.5M | bicubic | 288 (1.0 crop) |
 | efficientnet_b2a | 80.608 (19.392) | 95.310 (4.690) | 9.11M | bicubic | 288 (1.0 crop) |
 | mixnet_xl | 80.478 (19.522) | 94.932 (5.068) | 11.90M | bicubic | 224 |
 | efficientnet_b2 | 80.402 (19.598) | 95.076 (4.924) | 9.11M | bicubic | 260 |
 | skresnext50_32x4d | 80.156 (19.844) | 94.642 (5.358) | 27.5M | bicubic | 224 |
 | resnext50d_32x4d | 79.674 (20.326) | 94.868 (5.132) | 25.1M | bicubic | 224 |
 | resnet50 | 79.038 (20.962) | 94.390 (5.610) | 25.6M | bicubic | 224 |
 | mixnet_l | 78.976 (21.024) | 94.184 (5.816) | 7.33M | bicubic | 224 |
@ -165,6 +184,7 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | seresnext26d_32x4d | 77.602 (22.398) | 93.608 (6.392) | 16.8M | bicubic | 224 |
 | mixnet_m | 77.256 (22.744) | 93.418 (6.582) | 5.01M | bicubic | 224 |
 | seresnext26_32x4d | 77.104 (22.896) | 93.316 (6.684) | 16.8M | bicubic | 224 |
 | skresnet34 | 76.912 (23.088) | 93.322 (6.678) | 22.2M | bicubic | 224 |
 | resnet26d | 76.68 (23.32) | 93.166 (6.834) | 16M | bicubic | 224 |
 | mixnet_s | 75.988 (24.012) | 92.794 (7.206) | 4.13M | bicubic | 224 |
 | mobilenetv3_100 | 75.634 (24.366) | 92.708 (7.292) | 5.5M | bicubic | 224 |
@ -175,6 +195,7 @@ I've leveraged the training scripts in this repository to train a few of the mod
 | seresnet34 | 74.808 (25.192) | 92.124 (7.876) | 22M | bilinear | 224 |
 | mnasnet_b1 | 74.658 (25.342) | 92.114 (7.886) | 4.38M | bicubic | 224 |
 | spnasnet_100 | 74.084 (25.916)  | 91.818 (8.182) | 4.42M | bilinear | 224 |
 | skresnet18 | 73.038 (26.962) | 91.168 (8.832) | 11.9M | bicubic | 224 |
 | seresnet18 | 71.742 (28.258) | 90.334 (9.666) | 11.8M | bicubic | 224 |
 ### Ported Weights
--- a/hubconf.py
+++ b/hubconf.py
@ -0,0 +1,10 @@
 # hubconf.py: entry file read by torch.hub. `dependencies` names the packages
 # that torch.hub expects to be importable before loading entrypoints from here.
 dependencies = ['torch']
 from timm.models import registry
 # Obtain a reference to this very module so names can be attached to it.
 current_module = __import__(__name__)
 # Bulk-publish the registry's entrypoints as module-level attributes.
 # NOTE(review): `_model_entrypoints` is a private registry attribute, presumably
 # a {model_name: entrypoint_fn} mapping -- confirm against timm.models.registry.
 current_module.__dict__.update(registry._model_entrypoints)
 # Per-model alternative that goes through the registry's public accessors
 # instead of the private mapping; left disabled, kept for reference.
 #for fn_name in registry.list_models():
 #    fn = registry.model_entrypoint(fn_name)
 #    setattr(current_module, fn_name, fn)
--- a/results/results-imagenet-a.csv
+++ b/results/results-imagenet-a.csv
@ -1,8 +1,14 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
 tf_efficientnet_l2_ns_475,62.3733,37.6267,87.0933,12.9067,480.31,475,0.936,bicubic
 tf_efficientnet_l2_ns,62.0267,37.9733,87.96,12.04,480.31,800,0.96,bicubic
 tf_efficientnet_b7_ns,45.76,54.24,74.2133,25.7867,66.35,600,0.949,bicubic
 ig_resnext101_32x48d,41.56,58.44,66.5467,33.4533,828.41,224,0.875,bilinear
 tf_efficientnet_b6_ns,40.4533,59.5467,68.8667,31.1333,43.04,528,0.942,bicubic
 ig_resnext101_32x32d,39.4267,60.5733,63.7867,36.2133,468.53,224,0.875,bilinear
-ig_resnext101_32x16d,36.0,64.0,59.0,41.0,194.03,224,0.875,bilinear
+tf_efficientnet_b5_ns,39.0133,60.9867,68.08,31.92,30.39,456,0.934,bicubic
 ig_resnext101_32x16d,36,64,59,41,194.03,224,0.875,bilinear
 swsl_resnext101_32x8d,32.0133,67.9867,59.44,40.56,88.79,224,0.875,bilinear
 tf_efficientnet_b4_ns,30.7867,69.2133,59.4667,40.5333,19.34,380,0.922,bicubic
 tf_efficientnet_b8_ap,29.5867,70.4133,56.9333,43.0667,87.41,672,0.954,bicubic
 tf_efficientnet_b8,29.3867,70.6133,57.0533,42.9467,87.41,672,0.954,bicubic
 ig_resnext101_32x8d,28.6667,71.3333,52.32,47.68,88.79,224,0.875,bilinear
@ -13,6 +19,7 @@ tf_efficientnet_b7,25.28,74.72,51.6667,48.3333,66.35,600,0.949,bicubic
 tf_efficientnet_b6_ap,24.3467,75.6533,50.44,49.56,43.04,528,0.942,bicubic
 tf_efficientnet_b6,20.3733,79.6267,45.48,54.52,43.04,528,0.942,bicubic
 tf_efficientnet_b5_ap,19.4667,80.5333,44.7333,55.2667,30.39,456,0.934,bicubic
 tf_efficientnet_b3_ns,19.44,80.56,44.6533,55.3467,12.23,300,0.904,bicubic
 swsl_resnext50_32x4d,18.04,81.96,41.9733,58.0267,25.03,224,0.875,bilinear
 ssl_resnext101_32x16d,17.1867,82.8133,39.9333,60.0667,194.03,224,0.875,bilinear
 tf_efficientnet_b5,17.0533,82.9467,41.92,58.08,30.39,456,0.934,bicubic
@ -23,13 +30,15 @@ tf_efficientnet_b4,13.32,86.68,35.5333,64.4667,19.34,380,0.922,bicubic
 pnasnet5large,13.0533,86.9467,32.2267,67.7733,86.06,331,0.875,bicubic
 nasnetalarge,12.56,87.44,33.4267,66.5733,88.75,331,0.875,bicubic
 ssl_resnext101_32x4d,12.1067,87.8933,31.8933,68.1067,44.18,224,0.875,bilinear
 tf_efficientnet_b2_ns,11.7333,88.2667,32.96,67.04,9.11,260,0.89,bicubic
 gluon_senet154,9.8933,90.1067,26.4267,73.5733,115.09,224,0.875,bicubic
 ssl_resnext50_32x4d,9.6533,90.3467,28.4667,71.5333,25.03,224,0.875,bilinear
 senet154,9.4667,90.5333,26.44,73.56,115.09,224,0.875,bilinear
-efficientnet_b3a,9.2533,90.7467,28.4267,71.5733,12.23,320,1.0,bicubic
+efficientnet_b3a,9.2533,90.7467,28.4267,71.5733,12.23,320,1,bicubic
 efficientnet_b3,8.9733,91.0267,28.2267,71.7733,12.23,300,0.904,bicubic
 inception_v4,8.8933,91.1067,24.68,75.32,42.68,299,0.875,bicubic
 gluon_seresnext101_64x4d,8.8667,91.1333,27.28,72.72,88.23,224,0.875,bicubic
 tf_efficientnet_b1_ns,8.6133,91.3867,27.2933,72.7067,7.79,240,0.882,bicubic
 gluon_xception65,8.44,91.56,25.12,74.88,39.92,299,0.875,bicubic
 gluon_resnet152_v1d,8.36,91.64,23.4267,76.5733,60.21,224,0.875,bicubic
 inception_resnet_v2,8.1733,91.8267,23.5733,76.4267,55.84,299,0.8975,bicubic
@ -39,14 +48,15 @@ tf_efficientnet_b3,8.0133,91.9867,25.48,74.52,12.23,300,0.904,bicubic
 ens_adv_inception_resnet_v2,7.9733,92.0267,23.8667,76.1333,55.84,299,0.8975,bicubic
 gluon_resnet152_v1s,7.8533,92.1467,23.1867,76.8133,60.32,224,0.875,bicubic
 gluon_resnext101_64x4d,7.72,92.28,23.3067,76.6933,83.46,224,0.875,bicubic
 skresnext50_32x4d,7.08,92.92,23.0667,76.9333,27.48,224,0.875,bicubic
 ssl_resnet50,7.04,92.96,23.9067,76.0933,25.56,224,0.875,bilinear
-efficientnet_b2a,6.7467,93.2533,23.5067,76.4933,9.11,288,1.0,bicubic
+efficientnet_b2a,6.7467,93.2533,23.5067,76.4933,9.11,288,1,bicubic
 seresnext101_32x4d,6.4,93.6,21.4933,78.5067,48.96,224,0.875,bilinear
 efficientnet_b2,6.0933,93.9067,21.96,78.04,9.11,260,0.875,bicubic
 gluon_resnext101_32x4d,6.0133,93.9867,21.12,78.88,44.18,224,0.875,bicubic
 gluon_resnet101_v1d,5.92,94.08,19.9467,80.0533,44.57,224,0.875,bicubic
 gluon_seresnext50_32x4d,5.7867,94.2133,21.4533,78.5467,27.56,224,0.875,bicubic
-gluon_inception_v3,5.5067,94.4933,20.0,80.0,23.83,299,0.875,bicubic
+gluon_inception_v3,5.5067,94.4933,20,80,23.83,299,0.875,bicubic
 mixnet_xl,5.4667,94.5333,21.08,78.92,11.9,224,0.875,bicubic
 gluon_resnet101_v1s,5.28,94.72,19.56,80.44,44.67,224,0.875,bicubic
 hrnet_w64,5.16,94.84,19.4933,80.5067,128.06,224,0.875,bilinear
@ -69,8 +79,9 @@ inception_v3,4.1867,95.8133,16.2933,83.7067,27.16,299,0.875,bicubic
 tf_efficientnet_b2_ap,4.16,95.84,18.3467,81.6533,9.11,260,0.89,bicubic
 seresnet152,4.1467,95.8533,15.9333,84.0667,66.82,224,0.875,bilinear
 resnext101_32x8d,4.1333,95.8667,16.92,83.08,88.79,224,0.875,bilinear
 tf_efficientnet_b0_ns,4.1333,95.8667,17.68,82.32,5.29,224,0.875,bicubic
 dpn98,4.08,95.92,15.96,84.04,61.57,224,0.875,bicubic
-res2net101_26w_4s,4.0,96.0,14.8667,85.1333,45.21,224,0.875,bilinear
+res2net101_26w_4s,4,96,14.8667,85.1333,45.21,224,0.875,bilinear
 efficientnet_b1,3.9733,96.0267,15.7733,84.2267,7.79,240,0.875,bicubic
 tf_efficientnet_b2,3.76,96.24,16.5867,83.4133,9.11,260,0.89,bicubic
 hrnet_w30,3.68,96.32,15.5733,84.4267,37.71,224,0.875,bilinear
@ -102,6 +113,7 @@ dla60_res2net,2.64,97.36,14.1733,85.8267,21.15,224,0.875,bilinear
 gluon_resnet101_v1b,2.6133,97.3867,13.56,86.44,44.55,224,0.875,bicubic
 dla60x,2.6,97.4,13.3467,86.6533,17.65,224,0.875,bilinear
 mixnet_m,2.5467,97.4533,12.4133,87.5867,5.01,224,0.875,bicubic
 efficientnet_es,2.3733,97.6267,13.8267,86.1733,5.44,224,0.875,bicubic
 resnet152,2.36,97.64,12.2,87.8,60.19,224,0.875,bilinear
 swsl_resnet18,2.3467,97.6533,11.2267,88.7733,11.69,224,0.875,bilinear
 wide_resnet50_2,2.32,97.68,11.8267,88.1733,68.88,224,0.875,bilinear
@ -133,7 +145,7 @@ ssl_resnet18,1.3867,98.6133,8.2,91.8,11.69,224,0.875,bilinear
 dla60,1.3333,98.6667,9.4667,90.5333,22.33,224,0.875,bilinear
 dpn68,1.32,98.68,8.8267,91.1733,12.61,224,0.875,bicubic
 res2net50_48w_2s,1.2933,98.7067,8.9333,91.0667,25.29,224,0.875,bilinear
-tf_mixnet_s,1.2667,98.7333,8.7467,91.2533,4.13,224,0.875,bicubic
+tf_mixnet_s,1.2667,98.7333,8.7333,91.2667,4.13,224,0.875,bicubic
 fbnetc_100,1.24,98.76,8.76,91.24,5.57,224,0.875,bilinear
 resnet26d,1.24,98.76,9.32,90.68,16.01,224,0.875,bicubic
 tf_mobilenetv3_large_100,1.1867,98.8133,7.9467,92.0533,5.48,224,0.875,bilinear
@ -143,9 +155,10 @@ seresnet34,1.12,98.88,7.4267,92.5733,21.96,224,0.875,bilinear
 tf_efficientnet_es,1.12,98.88,8.5867,91.4133,5.44,224,0.875,bicubic
 spnasnet_100,1.1067,98.8933,8.2133,91.7867,4.42,224,0.875,bilinear
 dla34,1.08,98.92,7.68,92.32,15.78,224,0.875,bilinear
-resnet34,1.0,99.0,7.5333,92.4667,21.8,224,0.875,bilinear
+resnet34,1,99,7.5333,92.4667,21.8,224,0.875,bilinear
 gluon_resnet34_v1b,0.8933,99.1067,6.6,93.4,21.8,224,0.875,bicubic
 hrnet_w18_small_v2,0.8933,99.1067,7.3867,92.6133,15.6,224,0.875,bilinear
 skresnet18,0.88,99.12,7.3467,92.6533,11.96,224,0.875,bicubic
 tf_mobilenetv3_large_075,0.88,99.12,6.72,93.28,3.99,224,0.875,bilinear
 mnasnet_100,0.8667,99.1333,7.8267,92.1733,4.38,224,0.875,bicubic
 tf_mobilenetv3_small_100,0.7467,99.2533,4.6667,95.3333,2.54,224,0.875,bilinear
@ -153,7 +166,7 @@ seresnet18,0.7333,99.2667,6.0267,93.9733,11.78,224,0.875,bicubic
 densenet121,0.68,99.32,6.8933,93.1067,7.98,224,0.875,bicubic
 tf_mobilenetv3_small_075,0.6533,99.3467,4.1867,95.8133,2.04,224,0.875,bilinear
 tv_resnet34,0.6,99.4,5.5333,94.4667,21.8,224,0.875,bilinear
-resnet26,0.5867,99.4133,6.8933,93.1067,16.0,224,0.875,bicubic
+resnet26,0.5867,99.4133,6.8933,93.1067,16,224,0.875,bicubic
 dla46_c,0.52,99.48,4.1733,95.8267,1.31,224,0.875,bilinear
 dla60x_c,0.48,99.52,5.2133,94.7867,1.34,224,0.875,bilinear
 tf_mobilenetv3_large_minimal_100,0.48,99.52,4.88,95.12,3.92,224,0.875,bilinear
@ -162,4 +175,4 @@ dla46x_c,0.4133,99.5867,4.44,95.56,1.08,224,0.875,bilinear
 gluon_resnet18_v1b,0.3867,99.6133,4.7867,95.2133,11.69,224,0.875,bicubic
 tf_mobilenetv3_small_minimal_100,0.36,99.64,2.8667,97.1333,2.04,224,0.875,bilinear
 resnet18,0.2933,99.7067,4.04,95.96,11.69,224,0.875,bilinear
-tv_resnet50,0.0,100.0,2.9067,97.0933,25.56,224,0.875,bilinear
+tv_resnet50,0,100,2.9067,97.0933,25.56,224,0.875,bilinear
--- a/results/results-imagenet.csv
+++ b/results/results-imagenet.csv
@ -1,7 +1,13 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
 tf_efficientnet_l2_ns,88.352,11.648,98.65,1.35,480.31,800,0.961,bicubic
 tf_efficientnet_l2_ns_475,88.234,11.766,98.546,1.454,480.31,475,0.936,bicubic
 tf_efficientnet_b7_ns,86.84,13.16,98.094,1.906,66.35,600,0.949,bicubic
 tf_efficientnet_b6_ns,86.452,13.548,97.882,2.118,43.04,528,0.942,bicubic
 tf_efficientnet_b5_ns,86.088,13.912,97.752,2.248,30.39,456,0.934,bicubic
 ig_resnext101_32x48d,85.428,14.572,97.572,2.428,828.41,224,0.875,bilinear
 tf_efficientnet_b8,85.37,14.63,97.39,2.61,87.41,672,0.954,bicubic
 tf_efficientnet_b8_ap,85.37,14.63,97.294,2.706,87.41,672,0.954,bicubic
 tf_efficientnet_b4_ns,85.162,14.838,97.47,2.53,19.34,380,0.922,bicubic
 tf_efficientnet_b7_ap,85.12,14.88,97.252,2.748,66.35,600,0.949,bicubic
 ig_resnext101_32x32d,85.094,14.906,97.438,2.562,468.53,224,0.875,bilinear
 tf_efficientnet_b7,84.936,15.064,97.204,2.796,66.35,600,0.949,bicubic
@ -10,6 +16,7 @@ swsl_resnext101_32x8d,84.284,15.716,97.176,2.824,88.79,224,0.875,bilinear
 tf_efficientnet_b5_ap,84.252,15.748,96.974,3.026,30.39,456,0.934,bicubic
 ig_resnext101_32x16d,84.17,15.83,97.196,2.804,194.03,224,0.875,bilinear
 tf_efficientnet_b6,84.11,15.89,96.886,3.114,43.04,528,0.942,bicubic
 tf_efficientnet_b3_ns,84.048,15.952,96.91,3.09,12.23,300,0.904,bicubic
 tf_efficientnet_b5,83.812,16.188,96.748,3.252,30.39,456,0.934,bicubic
 swsl_resnext101_32x16d,83.346,16.654,96.846,3.154,194.03,224,0.875,bilinear
 tf_efficientnet_b4_ap,83.248,16.752,96.392,3.608,19.34,380,0.922,bicubic
@ -18,6 +25,7 @@ tf_efficientnet_b4,83.022,16.978,96.3,3.7,19.34,380,0.922,bicubic
 pnasnet5large,82.736,17.264,96.046,3.954,86.06,331,0.875,bicubic
 ig_resnext101_32x8d,82.688,17.312,96.636,3.364,88.79,224,0.875,bilinear
 nasnetalarge,82.554,17.446,96.038,3.962,88.75,331,0.875,bicubic
 tf_efficientnet_b2_ns,82.38,17.62,96.248,3.752,9.11,260,0.89,bicubic
 swsl_resnext50_32x4d,82.182,17.818,96.23,3.77,25.03,224,0.875,bilinear
 efficientnet_b3a,81.866,18.134,95.836,4.164,12.23,320,1,bicubic
 ssl_resnext101_32x16d,81.844,18.156,96.096,3.904,194.03,224,0.875,bilinear
@ -25,6 +33,7 @@ tf_efficientnet_b3_ap,81.822,18.178,95.624,4.376,12.23,300,0.904,bicubic
 tf_efficientnet_b3,81.636,18.364,95.718,4.282,12.23,300,0.904,bicubic
 ssl_resnext101_32x8d,81.616,18.384,96.038,3.962,88.79,224,0.875,bilinear
 efficientnet_b3,81.494,18.506,95.716,4.284,12.23,300,0.904,bicubic
 tf_efficientnet_b1_ns,81.388,18.612,95.738,4.262,7.79,240,0.882,bicubic
 senet154,81.31,18.69,95.496,4.504,115.09,224,0.875,bilinear
 gluon_senet154,81.234,18.766,95.348,4.652,115.09,224,0.875,bicubic
 swsl_resnet50,81.166,18.834,95.972,4.028,25.56,224,0.875,bilinear
@ -47,9 +56,10 @@ tf_efficientnet_b2_ap,80.3,19.7,95.028,4.972,9.11,260,0.89,bicubic
 seresnext101_32x4d,80.228,19.772,95.018,4.982,48.96,224,0.875,bilinear
 inception_v4,80.168,19.832,94.968,5.032,42.68,299,0.875,bicubic
 dpn107,80.156,19.844,94.91,5.09,86.92,224,0.875,bicubic
 skresnext50_32x4d,80.156,19.844,94.642,5.358,27.48,224,0.875,bicubic
 tf_efficientnet_b2,80.086,19.914,94.908,5.092,9.11,260,0.89,bicubic
 dpn92,80.008,19.992,94.836,5.164,37.67,224,0.875,bicubic
-ens_adv_inception_resnet_v2,79.982,20.018,94.938,5.062,55.84,299,0.8975,bicubic
+ens_adv_inception_resnet_v2,79.982,20.018,94.936,5.064,55.84,299,0.8975,bicubic
 gluon_seresnext50_32x4d,79.918,20.082,94.822,5.178,27.56,224,0.875,bicubic
 gluon_resnet152_v1c,79.91,20.09,94.84,5.16,60.21,224,0.875,bicubic
 dpn131,79.822,20.178,94.71,5.29,79.25,224,0.875,bicubic
@ -85,6 +95,7 @@ tf_efficientnet_em,78.708,21.292,94.314,5.686,6.9,240,0.882,bicubic
 efficientnet_b1,78.698,21.302,94.144,5.856,7.79,240,0.875,bicubic
 dla169,78.688,21.312,94.336,5.664,53.99,224,0.875,bilinear
 seresnet152,78.66,21.34,94.37,5.63,66.82,224,0.875,bilinear
 tf_efficientnet_b0_ns,78.658,21.342,94.376,5.624,5.29,224,0.875,bicubic
 res2net50_26w_6s,78.57,21.43,94.124,5.876,37.05,224,0.875,bilinear
 resnext50_32x4d,78.512,21.488,94.042,5.958,25.03,224,0.875,bicubic
 dla102x,78.51,21.49,94.228,5.772,26.77,224,0.875,bilinear
@ -99,6 +110,7 @@ dla60x,78.246,21.754,94.018,5.982,17.65,224,0.875,bilinear
 res2next50,78.246,21.754,93.892,6.108,24.67,224,0.875,bilinear
 hrnet_w30,78.206,21.794,94.222,5.778,37.71,224,0.875,bilinear
 res2net50_14w_8s,78.15,21.85,93.848,6.152,25.06,224,0.875,bilinear
 efficientnet_es,78.066,21.934,93.926,6.074,5.44,224,0.875,bicubic
 dla102,78.032,21.968,93.946,6.054,33.73,224,0.875,bilinear
 gluon_resnet50_v1c,78.012,21.988,93.988,6.012,25.58,224,0.875,bicubic
 seresnext26t_32x4d,77.998,22.002,93.708,6.292,16.82,224,0.875,bicubic
@ -115,7 +127,7 @@ adv_inception_v3,77.582,22.418,93.736,6.264,23.83,299,0.875,bicubic
 gluon_resnet50_v1b,77.58,22.42,93.716,6.284,25.56,224,0.875,bicubic
 res2net50_48w_2s,77.522,22.478,93.554,6.446,25.29,224,0.875,bilinear
 dpn68b,77.512,22.488,93.822,6.178,12.61,224,0.875,bicubic
-inception_v3,77.438,22.562,93.474,6.526,27.16,299,0.875,bicubic
+inception_v3,77.44,22.56,93.474,6.526,27.16,299,0.875,bicubic
 resnet101,77.374,22.626,93.54,6.46,44.55,224,0.875,bilinear
 densenet161,77.358,22.642,93.638,6.362,28.68,224,0.875,bicubic
 tf_efficientnet_cc_b0_4e,77.306,22.694,93.334,6.666,13.31,224,0.875,bicubic
@ -151,11 +163,12 @@ spnasnet_100,74.084,25.916,91.818,8.182,4.42,224,0.875,bilinear
 tf_mobilenetv3_large_075,73.438,26.562,91.35,8.65,3.99,224,0.875,bilinear
 tv_resnet34,73.312,26.688,91.426,8.574,21.8,224,0.875,bilinear
 swsl_resnet18,73.276,26.724,91.734,8.266,11.69,224,0.875,bilinear
 skresnet18,73.038,26.962,91.168,8.832,11.96,224,0.875,bicubic
 ssl_resnet18,72.61,27.39,91.416,8.584,11.69,224,0.875,bilinear
 hrnet_w18_small,72.342,27.658,90.678,9.322,13.19,224,0.875,bilinear
 tf_mobilenetv3_large_minimal_100,72.248,27.752,90.63,9.37,3.92,224,0.875,bilinear
 seresnet18,71.742,28.258,90.334,9.666,11.78,224,0.875,bicubic
-gluon_resnet18_v1b,70.836,29.164,89.76,10.24,11.69,224,0.875,bicubic
+gluon_resnet18_v1b,70.836,29.164,89.762,10.238,11.69,224,0.875,bicubic
 resnet18,69.748,30.252,89.078,10.922,11.69,224,0.875,bilinear
 tf_mobilenetv3_small_100,67.922,32.078,87.664,12.336,2.54,224,0.875,bilinear
 dla60x_c,67.892,32.108,88.426,11.574,1.34,224,0.875,bilinear
--- a/results/results-imagenetv2-matched-frequency.csv
+++ b/results/results-imagenetv2-matched-frequency.csv
@ -1,9 +1,15 @@
 model,top1,top1_err,top5,top5_err,param_count,img_size,cropt_pct,interpolation
 tf_efficientnet_l2_ns_475,80.46,19.54,95.73,4.27,480.31,475,0.936,bicubic
 tf_efficientnet_l2_ns,80.25,19.75,95.84,4.16,480.31,800,0.96,bicubic
 tf_efficientnet_b7_ns,78.51,21.49,94.38,5.62,66.35,600,0.949,bicubic
 tf_efficientnet_b6_ns,77.28,22.72,93.89,6.11,43.04,528,0.942,bicubic
 ig_resnext101_32x48d,76.87,23.13,93.31,6.69,828.41,224,0.875,bilinear
 ig_resnext101_32x32d,76.84,23.16,93.2,6.8,468.53,224,0.875,bilinear
 tf_efficientnet_b5_ns,76.81,23.19,93.58,6.42,30.39,456,0.934,bicubic
 tf_efficientnet_b7_ap,76.09,23.91,92.97,7.03,66.35,600,0.949,bicubic
 tf_efficientnet_b8_ap,76.09,23.91,92.73,7.27,87.41,672,0.954,bicubic
 ig_resnext101_32x16d,75.72,24.28,92.91,7.09,194.03,224,0.875,bilinear
 tf_efficientnet_b4_ns,75.67,24.33,93.05,6.95,19.34,380,0.922,bicubic
 swsl_resnext101_32x8d,75.43,24.57,92.76,7.24,88.79,224,0.875,bilinear
 tf_efficientnet_b6_ap,75.38,24.62,92.44,7.56,43.04,528,0.942,bicubic
 tf_efficientnet_b8,74.94,25.06,92.31,7.69,87.41,672,0.954,bicubic
@ -12,6 +18,7 @@ tf_efficientnet_b5_ap,74.6,25.4,91.99,8.01,30.39,456,0.934,bicubic
 swsl_resnext101_32x4d,74.14,25.86,91.99,8.01,44.18,224,0.875,bilinear
 swsl_resnext101_32x16d,74.02,25.98,92.16,7.84,194.03,224,0.875,bilinear
 tf_efficientnet_b6,73.9,26.1,91.75,8.25,43.04,528,0.942,bicubic
 tf_efficientnet_b3_ns,73.89,26.11,91.87,8.13,12.23,300,0.904,bicubic
 ig_resnext101_32x8d,73.65,26.35,92.19,7.81,88.79,224,0.875,bilinear
 tf_efficientnet_b5,73.55,26.45,91.46,8.54,30.39,456,0.934,bicubic
 tf_efficientnet_b4_ap,72.89,27.11,90.98,9.02,19.34,380,0.922,bicubic
@ -19,11 +26,13 @@ swsl_resnext50_32x4d,72.56,27.44,90.87,9.13,25.03,224,0.875,bilinear
 pnasnet5large,72.38,27.62,90.24,9.76,86.06,331,0.875,bicubic
 nasnetalarge,72.32,27.68,90.53,9.47,88.75,331,0.875,bicubic
 tf_efficientnet_b4,72.29,27.71,90.59,9.41,19.34,380,0.922,bicubic
 tf_efficientnet_b2_ns,72.28,27.72,91.09,8.91,9.11,260,0.89,bicubic
 swsl_resnet50,71.7,28.3,90.5,9.5,25.56,224,0.875,bilinear
 ssl_resnext101_32x8d,71.5,28.5,90.46,9.54,88.79,224,0.875,bilinear
 ssl_resnext101_32x16d,71.41,28.59,90.56,9.44,194.03,224,0.875,bilinear
 tf_efficientnet_b3_ap,70.92,29.08,89.43,10.57,12.23,300,0.904,bicubic
 efficientnet_b3a,70.87,29.13,89.72,10.28,12.23,320,1.0,bicubic
 tf_efficientnet_b1_ns,70.87,29.13,90.12,9.88,7.79,240,0.882,bicubic
 efficientnet_b3,70.76,29.24,89.85,10.15,12.23,300,0.904,bicubic
 tf_efficientnet_b3,70.64,29.36,89.44,10.56,12.23,300,0.904,bicubic
 gluon_senet154,70.6,29.4,88.92,11.08,115.09,224,0.875,bicubic
@ -31,7 +40,7 @@ ssl_resnext101_32x4d,70.53,29.47,89.76,10.24,44.18,224,0.875,bilinear
 senet154,70.5,29.5,89.01,10.99,115.09,224,0.875,bilinear
 gluon_seresnext101_64x4d,70.43,29.57,89.35,10.65,88.23,224,0.875,bicubic
 gluon_resnet152_v1s,70.29,29.71,88.85,11.15,60.32,224,0.875,bicubic
-inception_resnet_v2,70.12,29.88,88.69,11.31,55.84,299,0.8975,bicubic
+inception_resnet_v2,70.12,29.88,88.7,11.3,55.84,299,0.8975,bicubic
 gluon_seresnext101_32x4d,70.01,29.99,88.9,11.1,48.96,224,0.875,bicubic
 gluon_resnet152_v1d,69.96,30.04,88.49,11.51,60.21,224,0.875,bicubic
 ssl_resnext50_32x4d,69.71,30.29,89.44,10.56,25.03,224,0.875,bilinear
@ -57,6 +66,7 @@ gluon_seresnext50_32x4d,68.67,31.33,88.31,11.69,27.56,224,0.875,bicubic
 hrnet_w64,68.64,31.36,88.05,11.95,128.06,224,0.875,bilinear
 dpn98,68.59,31.41,87.68,12.32,61.57,224,0.875,bicubic
 ssl_resnet50,68.41,31.59,88.56,11.44,25.56,224,0.875,bilinear
 skresnext50_32x4d,68.35,31.65,87.57,12.43,27.48,224,0.875,bicubic
 dla102x2,68.33,31.67,87.89,12.11,41.75,224,0.875,bilinear
 gluon_resnext50_32x4d,68.31,31.69,87.3,12.7,25.03,224,0.875,bicubic
 tf_efficientnet_el,68.18,31.82,88.35,11.65,10.59,300,0.904,bicubic
@ -66,6 +76,7 @@ resnext101_32x8d,67.86,32.14,87.49,12.51,88.79,224,0.875,bilinear
 seresnext50_32x4d,67.84,32.16,87.62,12.38,27.56,224,0.875,bilinear
 hrnet_w48,67.77,32.23,87.42,12.58,77.47,224,0.875,bilinear
 hrnet_w44,67.74,32.26,87.56,12.44,67.06,224,0.875,bilinear
 tf_efficientnet_b0_ns,67.71,32.29,88.07,11.93,5.29,224,0.875,bicubic
 xception,67.65,32.35,87.57,12.43,22.86,299,0.8975,bicubic
 dla169,67.61,32.39,87.59,12.41,53.99,224,0.875,bilinear
 gluon_inception_v3,67.59,32.41,87.47,12.53,23.83,299,0.875,bicubic
@ -87,6 +98,7 @@ dla60_res2net,67.02,32.98,87.16,12.84,21.15,224,0.875,bilinear
 dla102x,67.01,32.99,86.77,13.23,26.77,224,0.875,bilinear
 mixnet_l,66.94,33.06,86.91,13.09,7.33,224,0.875,bicubic
 res2net50_26w_6s,66.91,33.09,86.86,13.14,37.05,224,0.875,bilinear
 efficientnet_es,66.88,33.12,86.73,13.27,5.44,224,0.875,bicubic
 tf_efficientnet_b1,66.88,33.12,87.01,12.99,7.79,240,0.882,bicubic
 tf_efficientnet_em,66.88,33.12,86.97,13.03,6.9,240,0.882,bicubic
 resnext50_32x4d,66.87,33.13,86.34,13.66,25.03,224,0.875,bicubic
@ -95,7 +107,7 @@ tf_mixnet_l,66.78,33.22,86.47,13.53,7.33,224,0.875,bicubic
 selecsls60b,66.76,33.24,86.53,13.47,32.77,224,0.875,bicubic
 hrnet_w32,66.75,33.25,87.3,12.7,41.23,224,0.875,bilinear
 wide_resnet101_2,66.73,33.27,87.03,12.97,126.89,224,0.875,bilinear
-adv_inception_v3,66.65,33.35,86.53,13.47,23.83,299,0.875,bicubic
+adv_inception_v3,66.65,33.35,86.54,13.46,23.83,299,0.875,bicubic
 wide_resnet50_2,66.65,33.35,86.8,13.2,68.88,224,0.875,bilinear
 dla60_res2next,66.64,33.36,87.03,12.97,17.33,224,0.875,bilinear
 gluon_resnet50_v1c,66.56,33.44,86.18,13.82,25.58,224,0.875,bicubic
@ -106,7 +118,7 @@ seresnet50,66.25,33.75,86.33,13.67,28.09,224,0.875,bilinear
 selecsls60,66.21,33.79,86.34,13.66,30.67,224,0.875,bicubic
 tv_resnext50_32x4d,66.18,33.82,86.04,13.96,25.03,224,0.875,bilinear
 tf_efficientnet_cc_b0_8e,66.17,33.83,86.24,13.76,24.01,224,0.875,bicubic
-inception_v3,66.15,33.85,86.33,13.67,27.16,299,0.875,bicubic
+inception_v3,66.16,33.84,86.32,13.68,27.16,299,0.875,bicubic
 res2net50_26w_4s,66.14,33.86,86.6,13.4,25.7,224,0.875,bilinear
 gluon_resnet50_v1b,66.07,33.93,86.26,13.74,25.56,224,0.875,bicubic
 res2net50_14w_8s,66.02,33.98,86.25,13.75,25.06,224,0.875,bilinear
@ -151,6 +163,7 @@ mnasnet_100,61.9,38.1,83.71,16.29,4.38,224,0.875,bicubic
 ssl_resnet18,61.48,38.52,83.3,16.7,11.69,224,0.875,bilinear
 spnasnet_100,61.22,38.78,82.79,17.21,4.42,224,0.875,bilinear
 tv_resnet34,61.19,38.81,82.71,17.29,21.8,224,0.875,bilinear
 skresnet18,60.86,39.14,82.88,17.12,11.96,224,0.875,bicubic
 tf_mobilenetv3_large_075,60.4,39.6,81.95,18.05,3.99,224,0.875,bilinear
 seresnet18,59.8,40.2,81.69,18.31,11.78,224,0.875,bicubic
 tf_mobilenetv3_large_minimal_100,59.07,40.93,81.15,18.85,3.92,224,0.875,bilinear
--- a/results/results-sketch.csv
+++ b/results/results-sketch.csv
@ -6,19 +6,28 @@ swsl_resnext101_32x16d,57.4584,42.5416,80.3848,19.6152,194.03,224,0.875,bilinear
 swsl_resnext101_32x8d,56.4385,43.5615,78.9444,21.0556,88.79,224,0.875,bilinear
 ig_resnext101_32x8d,54.9176,45.0824,77.5335,22.4665,88.79,224,0.875,bilinear
 swsl_resnext101_32x4d,53.6029,46.3971,76.3466,23.6534,44.18,224,0.875,bilinear
 tf_efficientnet_l2_ns_475,51.4944,48.5056,73.9276,26.0724,480.31,475,0.936,bicubic
 swsl_resnext50_32x4d,50.4372,49.5628,73.3675,26.6325,25.03,224,0.875,bilinear
 swsl_resnet50,49.5412,50.4588,72.3339,27.6661,25.56,224,0.875,bilinear
 tf_efficientnet_b7_ns,47.8001,52.1999,69.6398,30.3602,66.35,600,0.949,bicubic
 tf_efficientnet_b6_ns,47.7608,52.2392,69.968,30.032,43.04,528,0.942,bicubic
 tf_efficientnet_l2_ns,47.6311,52.3689,70.0033,29.9967,480.31,800,0.961,bicubic
 tf_efficientnet_b8_ap,45.7741,54.2259,67.9106,32.0894,87.41,672,0.954,bicubic
-tf_efficientnet_b8,42.508,57.492,64.857,35.143,87.41,672,0.954,bicubic
+tf_efficientnet_b5_ns,45.615,54.385,67.8418,32.1582,30.39,456,0.934,bicubic
 tf_efficientnet_b4_ns,43.4495,56.5505,65.5191,34.4809,19.34,380,0.922,bicubic
 tf_efficientnet_b8,42.5082,57.4918,64.8568,35.1432,87.41,672,0.954,bicubic
 tf_efficientnet_b7,41.4314,58.5686,63.0175,36.9825,66.35,600,0.949,bicubic
 tf_efficientnet_b7_ap,41.4294,58.5706,62.8741,37.1259,66.35,600,0.949,bicubic
 tf_efficientnet_b5_ap,41.4176,58.5824,62.0841,37.9159,30.39,456,0.934,bicubic
 tf_efficientnet_b6_ap,41.0993,58.9007,62.3553,37.6447,43.04,528,0.942,bicubic
 tf_efficientnet_b4_ap,40.4842,59.5158,61.7226,38.2774,19.34,380,0.922,bicubic
 tf_efficientnet_b3_ns,39.5842,60.4158,61.4534,38.5466,12.23,300,0.904,bicubic
 tf_efficientnet_b5,38.356,61.644,59.9128,40.0872,30.39,456,0.934,bicubic
 tf_efficientnet_b3_ap,37.0552,62.9448,57.2403,42.7597,12.23,300,0.904,bicubic
 tf_efficientnet_b2_ns,36.1827,63.8173,57.5507,42.4493,9.11,260,0.89,bicubic
 swsl_resnet18,35.8584,64.1416,58.4547,41.5453,11.69,224,0.875,bilinear
-ssl_resnext101_32x16d,34.6028,65.3972,55.9315,44.0685,194.03,224,0.875,bilinear
+ssl_resnext101_32x16d,34.6047,65.3953,55.9315,44.0685,194.03,224,0.875,bilinear
 tf_efficientnet_b1_ns,34.1567,65.8433,55.4894,44.5106,7.79,240,0.882,bicubic
 tf_efficientnet_b4,34.0643,65.9357,54.1984,45.8016,19.34,380,0.922,bicubic
 ssl_resnext101_32x8d,34.0172,65.9828,55.6014,44.3986,88.79,224,0.875,bilinear
 tf_efficientnet_b6,33.9975,66.0025,54.5442,45.4558,43.04,528,0.942,bicubic
@ -28,7 +37,7 @@ gluon_resnet152_v1d,32.734,67.266,51.0877,48.9123,60.21,224,0.875,bicubic
 tf_efficientnet_b2_ap,32.6809,67.3191,52.2392,47.7608,9.11,260,0.89,bicubic
 nasnetalarge,32.5964,67.4036,49.7789,50.2211,88.75,331,0.875,bicubic
 pnasnet5large,32.5296,67.4704,50.1916,49.8084,86.06,331,0.875,bicubic
-ens_adv_inception_resnet_v2,32.3724,67.6276,50.4274,49.5726,55.84,299,0.8975,bicubic
+ens_adv_inception_resnet_v2,32.3705,67.6295,50.4274,49.5726,55.84,299,0.8975,bicubic
 gluon_resnet152_v1s,32.3312,67.6688,50.5257,49.4743,60.32,224,0.875,bicubic
 gluon_seresnext101_64x4d,32.2054,67.7946,50.3193,49.6807,88.23,224,0.875,bicubic
 gluon_seresnext101_32x4d,32.1071,67.8929,51.237,48.763,48.96,224,0.875,bicubic
@ -56,23 +65,25 @@ senet154,30.0006,69.9994,48.034,51.966,115.09,224,0.875,bilinear
 dpn92,29.9534,70.0466,49.1619,50.8381,37.67,224,0.875,bicubic
 gluon_senet154,29.8768,70.1232,47.8944,52.1056,115.09,224,0.875,bicubic
 xception,29.865,70.135,48.6864,51.3136,22.86,299,0.8975,bicubic
-adv_inception_v3,29.8178,70.1822,47.8473,52.1527,23.83,299,0.875,bicubic
+adv_inception_v3,29.8159,70.1841,47.8473,52.1527,23.83,299,0.875,bicubic
 efficientnet_b2,29.6154,70.3846,48.7767,51.2233,9.11,260,0.875,bicubic
 gluon_xception65,29.5506,70.4494,47.5054,52.4946,39.92,299,0.875,bicubic
 resnext101_32x8d,29.4386,70.5614,48.4859,51.5141,88.79,224,0.875,bilinear
 ssl_resnet50,29.4229,70.5771,49.7809,50.2191,25.56,224,0.875,bilinear
-gluon_inception_v3,29.1242,70.8758,46.9591,53.0409,23.83,299,0.875,bicubic
+gluon_inception_v3,29.1242,70.8758,46.9571,53.0429,23.83,299,0.875,bicubic
 hrnet_w64,28.9886,71.0114,47.1418,52.8582,128.06,224,0.875,bilinear
 tf_efficientnet_b0_ns,28.9021,71.0979,49.0106,50.9894,5.29,224,0.875,bicubic
 tf_efficientnet_b1,28.8864,71.1136,47.5034,52.4966,7.79,240,0.882,bicubic
 gluon_resnet101_v1b,28.8785,71.1215,46.3892,53.6108,44.55,224,0.875,bicubic
 skresnext50_32x4d,28.8176,71.1824,46.4973,53.5027,27.48,224,0.875,bicubic
 gluon_seresnext50_32x4d,28.6506,71.3494,46.4364,53.5636,27.56,224,0.875,bicubic
 hrnet_w40,28.6408,71.3592,47.4543,52.5457,57.56,224,0.875,bilinear
 resnet152,28.5327,71.4673,47.1182,52.8818,60.19,224,0.875,bilinear
 hrnet_w48,28.4128,71.5872,47.5859,52.4141,77.47,224,0.875,bilinear
 gluon_resnext50_32x4d,28.3755,71.6245,45.3281,54.6719,25.03,224,0.875,bicubic
 tf_efficientnet_b0_ap,28.346,71.654,47.5309,52.4691,5.29,224,0.875,bicubic
 dla102x2,28.3146,71.6854,46.7606,53.2394,41.75,224,0.875,bilinear
 tf_efficientnet_cc_b0_4e,28.3146,71.6854,47.3639,52.6361,13.31,224,0.875,bicubic
 dla102x2,28.3126,71.6874,46.7606,53.2394,41.75,224,0.875,bilinear
 dla169,28.3126,71.6874,47.3914,52.6086,53.99,224,0.875,bilinear
 mixnet_xl,28.2871,71.7129,46.7016,53.2984,11.9,224,0.875,bicubic
 gluon_resnet50_v1d,28.2458,71.7542,45.8783,54.1217,25.58,224,0.875,bicubic
@ -80,22 +91,23 @@ wide_resnet101_2,28.1082,71.8918,46.401,53.599,126.89,224,0.875,bilinear
 gluon_resnet101_v1c,28.1043,71.8957,45.9608,54.0392,44.57,224,0.875,bicubic
 densenet161,28.0807,71.9193,46.6407,53.3593,28.68,224,0.875,bicubic
 dpn68b,27.8842,72.1158,47.468,52.532,12.61,224,0.875,bicubic
-tf_inception_v3,27.7801,72.2199,45.7211,54.2789,23.83,299,0.875,bicubic
+tf_inception_v3,27.782,72.218,45.7191,54.2809,23.83,299,0.875,bicubic
 res2net101_26w_4s,27.7683,72.2317,45.1787,54.8213,45.21,224,0.875,bilinear
 hrnet_w44,27.6209,72.3791,45.837,54.163,67.06,224,0.875,bilinear
-inception_v3,27.5561,72.4439,45.2652,54.7348,27.16,299,0.875,bicubic
+inception_v3,27.5561,72.4439,45.2632,54.7368,27.16,299,0.875,bicubic
 hrnet_w30,27.3812,72.6188,46.5543,53.4457,37.71,224,0.875,bilinear
 hrnet_w32,27.3694,72.6306,45.9942,54.0058,41.23,224,0.875,bilinear
 gluon_resnet50_v1s,27.3261,72.6739,45.222,54.778,25.68,224,0.875,bicubic
 densenet201,27.2652,72.7348,46.2222,53.7778,20.01,224,0.875,bicubic
 res2net50_26w_8s,27.0785,72.9215,44.4281,55.5719,48.4,224,0.875,bilinear
-dla102x,27.0609,72.9391,45.4754,54.5246,26.77,224,0.875,bilinear
+dla102x,27.0609,72.9391,45.4735,54.5265,26.77,224,0.875,bilinear
 resnet101,26.9626,73.0374,45.2337,54.7663,44.55,224,0.875,bilinear
 resnext50d_32x4d,26.8761,73.1239,44.4359,55.5641,25.05,224,0.875,bicubic
 densenet169,26.829,73.171,45.3733,54.6267,14.15,224,0.875,bicubic
 seresnext101_32x4d,26.8113,73.1887,43.4966,56.5034,48.96,224,0.875,bilinear
 seresnet152,26.6757,73.3243,43.9466,56.0534,66.82,224,0.875,bilinear
 tf_efficientnet_el,26.6226,73.3774,44.6482,55.3518,10.59,300,0.904,bicubic
 efficientnet_es,26.6207,73.3793,45.1119,54.8881,5.44,224,0.875,bicubic
 res2net50_26w_6s,26.5951,73.4049,43.9899,56.0101,37.05,224,0.875,bilinear
 dla60x,26.5519,73.4481,45.0235,54.9765,17.65,224,0.875,bilinear
 tf_efficientnet_b0,26.4851,73.5149,45.6464,54.3536,5.29,224,0.875,bicubic
@ -125,12 +137,13 @@ efficientnet_b0,25.0152,74.9848,42.7872,57.2128,5.29,224,0.875,bicubic
 gluon_resnet34_v1b,24.9386,75.0614,42.2429,57.7571,21.8,224,0.875,bicubic
 dla60,24.9327,75.0673,43.2962,56.7038,22.33,224,0.875,bilinear
 tf_efficientnet_em,24.5416,75.4584,42.4119,57.5881,6.9,240,0.882,bicubic
 skresnet18,24.4827,75.5173,42.5357,57.4643,11.96,224,0.875,bicubic
 tv_resnet50,24.07,75.93,41.3134,58.6866,25.56,224,0.875,bilinear
 seresnet34,24.0268,75.9732,41.9089,58.0911,21.96,224,0.875,bilinear
 densenet121,23.8441,76.1559,41.9246,58.0754,7.98,224,0.875,bicubic
 tf_efficientnet_es,23.8185,76.1815,41.3311,58.6689,5.44,224,0.875,bicubic
 mixnet_m,23.7104,76.2896,41.1405,58.8595,5.01,224,0.875,bicubic
-dla34,23.6692,76.3308,41.5512,58.4488,15.78,224,0.875,bilinear
+dla34,23.6692,76.3308,41.5532,58.4468,15.78,224,0.875,bilinear
 seresnet50,23.6515,76.3485,40.0912,59.9088,28.09,224,0.875,bilinear
 tf_mixnet_m,23.4844,76.5156,40.9892,59.0108,5.01,224,0.875,bicubic
 tv_resnet34,23.4727,76.5273,41.3665,58.6335,21.8,224,0.875,bilinear
--- a/sotabench.py
+++ b/sotabench.py
@ -56,8 +56,7 @@ model_list = [
           model_desc='Trained from scratch in PyTorch w/ RandAugment'),
    _entry('efficientnet_es', 'EfficientNet-EdgeTPU-S', '1905.11946',
           model_desc='Trained from scratch in PyTorch w/ RandAugment'),
-    _entry('fbnetc_100', 'FBNet-C', '1812.03443',
+
           model_desc='Trained in PyTorch with RMSProp, exponential LR decay'),
    _entry('gluon_inception_v3', 'Inception V3', '1512.00567', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_resnet18_v1b', 'ResNet-18', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_resnet34_v1b', 'ResNet-34', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
@ -82,14 +81,22 @@ model_list = [
    _entry('gluon_seresnext101_64x4d', 'SE-ResNeXt-101 64x4d', '1812.01187', model_desc='Ported from GluonCV Model Zoo'),
    _entry('gluon_xception65', 'Modified Aligned Xception', '1802.02611', batch_size=BATCH_SIZE//2,
           model_desc='Ported from GluonCV Model Zoo'),
    _entry('mixnet_xl', 'MixNet-XL', '1907.09595', model_desc="My own scaling beyond paper's MixNet Large"),
    _entry('mixnet_l', 'MixNet-L', '1907.09595'),
    _entry('mixnet_m', 'MixNet-M', '1907.09595'),
    _entry('mixnet_s', 'MixNet-S', '1907.09595'),
    _entry('fbnetc_100', 'FBNet-C', '1812.03443',
           model_desc='Trained in PyTorch with RMSProp, exponential LR decay'),
    _entry('mnasnet_100', 'MnasNet-B1', '1807.11626'),
    _entry('semnasnet_100', 'MnasNet-A1', '1807.11626'),
    _entry('spnasnet_100', 'Single-Path NAS', '1904.02877',
           model_desc='Trained in PyTorch with SGD, cosine LR decay'),
    _entry('mobilenetv3_rw', 'MobileNet V3-Large 1.0', '1905.02244',
           model_desc='Trained in PyTorch with RMSProp, exponential LR decay, and hyper-params matching '
                      'paper as closely as possible.'),
    _entry('resnet18', 'ResNet-18', '1812.01187'),
    _entry('resnet26', 'ResNet-26', '1812.01187', model_desc='Block cfg of ResNet-34 w/ Bottleneck'),
    _entry('resnet26d', 'ResNet-26-D', '1812.01187',
@ -103,7 +110,7 @@ model_list = [
    _entry('resnext50d_32x4d', 'ResNeXt-50-D 32x4d', '1812.01187',
           model_desc="'D' variant (3x3 deep stem w/ avg-pool downscale). Trained with "
                      "SGD w/ cosine LR decay, random-erasing (gaussian per-pixel noise) and label-smoothing"),
-    _entry('semnasnet_100', 'MnasNet-A1', '1807.11626'),
+
    _entry('seresnet18', 'SE-ResNet-18', '1709.01507'),
    _entry('seresnet34', 'SE-ResNet-34', '1709.01507'),
    _entry('seresnext26_32x4d', 'SE-ResNeXt-26 32x4d', '1709.01507',
@ -114,8 +121,9 @@ model_list = [
           model_desc='Block cfg of SE-ResNeXt-34 w/ Bottleneck, deep tiered stem, and avg-pool in downsample layers.'),
    _entry('seresnext26tn_32x4d', 'SE-ResNeXt-26-TN 32x4d', '1812.01187',
           model_desc='Block cfg of SE-ResNeXt-34 w/ Bottleneck, deep tiered narrow stem, and avg-pool in downsample layers.'),
-    _entry('spnasnet_100', 'Single-Path NAS', '1904.02877',
+
-           model_desc='Trained in PyTorch with SGD, cosine LR decay'),
+    _entry('skresnet18', 'SK-ResNet-18', '1903.06586'),
    _entry('skresnext50_32x4d', 'SKNet-50', '1903.06586'),
    _entry('tf_efficientnet_b0', 'EfficientNet-B0 (AutoAugment)', '1905.11946',
           model_desc='Ported from official Google AI Tensorflow weights'),
--- a/timm/models/init.py
+++ b/timm/models/init.py
@ -16,9 +16,10 @@ from .gluon_xception import *
 from .res2net import *
 from .dla import *
 from .hrnet import *
 from .sknet import *
 from .registry import *
 from .factory import create_model
 from .helpers import load_checkpoint, resume_checkpoint
-from .test_time_pool import TestTimePoolHead, apply_test_time_pool
+from .layers import TestTimePoolHead, apply_test_time_pool
-from .split_batchnorm import convert_splitbn_model
+from .layers import convert_splitbn_model
--- a/timm/models/conv2d_layers.py
+++ b/timm/models/conv2d_layers.py
@ -1,260 +0,0 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch._six import container_abcs
 from itertools import repeat
 from functools import partial
 import numpy as np
 import math
 # Tuple helpers ripped from PyTorch
 def _ntuple(n):
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse
 # Converters built from _ntuple: a non-iterable argument is repeated into a
 # 1-, 2-, 3- or 4-tuple respectively; an iterable argument is returned as-is.
 _single = _ntuple(1)
 _pair = _ntuple(2)
 _triple = _ntuple(3)
 _quadruple = _ntuple(4)
 def _is_static_pad(kernel_size, stride=1, dilation=1, **_):
    return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0
 def _get_padding(kernel_size, stride=1, dilation=1, **_):
    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
    return padding
 def _calc_same_pad(i, k, s, d):
    return max((math.ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0)
 def _split_channels(num_chan, num_groups):
    split = [num_chan // num_groups for _ in range(num_groups)]
    split[0] += num_chan - sum(split)
    return split
 # pylint: disable=unused-argument
 def conv2d_same(x, weight, bias=None, stride=(1, 1), padding=(0, 0), dilation=(1, 1), groups=1):
    """2D convolution with padding computed from the input size at call time.

    The ``padding`` argument is accepted for signature compatibility but is
    not used: the per-dimension total padding comes from ``_calc_same_pad``
    and is applied with ``F.pad`` before calling ``F.conv2d`` with zero padding.
    When the total is odd, the second (trailing) pad value of each dimension
    receives the extra element.
    """
    in_h, in_w = x.size()[-2:]
    k_h, k_w = weight.size()[-2:]
    total_h = _calc_same_pad(in_h, k_h, stride[0], dilation[0])
    total_w = _calc_same_pad(in_w, k_w, stride[1], dilation[1])
    if total_h > 0 or total_w > 0:
        lead_w = total_w // 2
        lead_h = total_h // 2
        x = F.pad(x, [lead_w, total_w - lead_w, lead_h, total_h - lead_h])
    return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups)
 class Conv2dSame(nn.Conv2d):
    """ nn.Conv2d variant whose forward pass delegates to `conv2d_same`
    (Tensorflow like 'SAME' convolution for 2D inputs).
    """
    # pylint: disable=unused-argument
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        # The caller-supplied `padding` is ignored; the base conv is always
        # configured with zero padding.
        super(Conv2dSame, self).__init__(
            in_channels, out_channels, kernel_size,
            stride=stride, padding=0, dilation=dilation, groups=groups, bias=bias)
    def forward(self, x):
        return conv2d_same(
            x, self.weight, self.bias,
            self.stride, self.padding, self.dilation, self.groups)
 def get_padding_value(padding, kernel_size, **kwargs):
    """Resolve a padding spec into ``(padding, dynamic)``.

    Non-string ``padding`` is returned untouched with ``dynamic=False``.
    String values are lower-cased and handled as follows:
      * 'valid' -> padding 0
      * 'same'  -> static symmetric padding when `_is_static_pad` allows it,
                   otherwise padding 0 with ``dynamic=True`` (TF compatible
                   'SAME' padding computed at runtime, which has a
                   performance / GPU memory cost)
      * any other string -> PyTorch style 'same'-ish symmetric padding
    """
    if not isinstance(padding, str):
        return padding, False
    mode = padding.lower()
    if mode == 'valid':
        # 'VALID' padding, same as padding=0
        return 0, False
    if mode == 'same' and not _is_static_pad(kernel_size, **kwargs):
        # dynamic 'SAME' padding, has runtime/GPU memory overhead
        return 0, True
    # static 'same' case and the default for every other string
    return _get_padding(kernel_size, **kwargs), False
 def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
    """Create a conv layer for the requested padding mode.

    ``padding`` is taken from ``kwargs`` (default ``''``) and ``bias``
    defaults to False. Returns a `Conv2dSame` when `get_padding_value`
    reports dynamic padding, otherwise an ``nn.Conv2d`` with the resolved
    padding value.
    """
    pad_spec = kwargs.pop('padding', '')
    kwargs.setdefault('bias', False)
    pad_value, needs_dynamic = get_padding_value(pad_spec, kernel_size, **kwargs)
    if not needs_dynamic:
        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=pad_value, **kwargs)
    return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
 class MixedConv2d(nn.Module):
    """ Mixed Grouped Convolution
    Input channels are split into one chunk per kernel size, each chunk is
    convolved by its own conv layer, and the results are concatenated along
    the channel dimension.
    Based on MDConv and GroupedConv in MixNet impl:
      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
    NOTE: This does not currently work with torch.jit.script
    """
    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
        super(MixedConv2d, self).__init__()
        # a bare (non-list) kernel size is treated as a single group
        kernels = kernel_size if isinstance(kernel_size, list) else [kernel_size]
        group_count = len(kernels)
        in_splits = _split_channels(in_channels, group_count)
        out_splits = _split_channels(out_channels, group_count)
        self.in_channels = sum(in_splits)
        self.out_channels = sum(out_splits)
        for idx, k in enumerate(kernels):
            in_ch = in_splits[idx]
            out_ch = out_splits[idx]
            conv_groups = out_ch if depthwise else 1
            branch = create_conv2d_pad(
                in_ch, out_ch, k, stride=stride,
                padding=padding, dilation=dilation, groups=conv_groups, **kwargs)
            # use add_module to keep key space clean
            self.add_module(str(idx), branch)
        self.splits = in_splits
    def forward(self, x):
        chunks = torch.split(x, self.splits, 1)
        outputs = []
        for chunk, conv in zip(chunks, self._modules.values()):
            outputs.append(conv(chunk))
        return torch.cat(outputs, 1)
 def get_condconv_initializer(initializer, num_experts, expert_shape):
    """Wrap ``initializer`` so it is applied to each expert of a 2D weight.

    The returned function expects a tensor of shape
    ``[num_experts, prod(expert_shape)]`` and calls ``initializer`` on each
    row viewed as ``expert_shape``; any other shape raises ``ValueError``.
    """
    def condconv_initializer(weight):
        """CondConv initializer function."""
        num_params = np.prod(expert_shape)
        shape_ok = (
            len(weight.shape) == 2
            and weight.shape[0] == num_experts
            and weight.shape[1] == num_params)
        if not shape_ok:
            raise ValueError(
                'CondConv variables must have shape [num_experts, num_params]')
        for expert_idx in range(num_experts):
            expert_view = weight[expert_idx].view(expert_shape)
            initializer(expert_view)
    return condconv_initializer
 class CondConv2d(nn.Module):
    """ Conditional Convolution
    Holds `num_experts` flattened conv kernels (and optionally biases). In forward, the experts are
    mixed per sample via `routing_weights` and all samples are convolved in one grouped convolution.
    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
    https://github.com/pytorch/pytorch/issues/17983
    """
    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
        """ Create the expert parameters and resolve the padding mode.
        `padding` may be a string mode or a value, it is resolved by `get_padding_value`.
        """
        super(CondConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        padding_val, is_padding_dynamic = get_padding_value(
            padding, kernel_size, stride=stride, dilation=dilation)
        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
        self.padding = _pair(padding_val)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.num_experts = num_experts
        # shape of a single expert kernel: (out_channels, in_channels // groups, kh, kw)
        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
        weight_num_param = 1
        for wd in self.weight_shape:
            weight_num_param *= wd
        # one flattened kernel per expert, stored as rows of a 2D parameter
        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
        if bias:
            self.bias_shape = (self.out_channels,)
            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    def reset_parameters(self):
        """ Initialize every expert's weight (kaiming uniform) and, if present, bias (uniform in +/- bound). """
        init_weight = get_condconv_initializer(
            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
        init_weight(self.weight)
        if self.bias is not None:
            # bound is derived from the fan-in of a single expert kernel
            fan_in = np.prod(self.weight_shape[1:])
            bound = 1 / math.sqrt(fan_in)
            init_bias = get_condconv_initializer(
                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
            init_bias(self.bias)
    def forward(self, x, routing_weights):
        """ Convolve each sample of `x` with its own mixture of the expert kernels.
        NOTE(review): `routing_weights` is presumably (B, num_experts) so the matmul yields one flattened
        kernel per sample -- confirm against callers.
        Returns a tensor viewed as (B, out_channels, out_h, out_w).
        """
        B, C, H, W = x.shape
        # per-sample kernels as a weighted sum of the experts, stacked along the output-channel dim
        weight = torch.matmul(routing_weights, self.weight)
        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
        weight = weight.view(new_weight_shape)
        bias = None
        if self.bias is not None:
            bias = torch.matmul(routing_weights, self.bias)
            bias = bias.view(B * self.out_channels)
        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
        x = x.view(1, B * C, H, W)
        if self.dynamic_padding:
            # padding computed from the input size at call time (see conv2d_same)
            out = conv2d_same(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        else:
            out = F.conv2d(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        # undo the batch-into-channels folding
        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
        # Literal port (from TF definition)
        # x = torch.split(x, 1, 0)
        # weight = torch.split(weight, 1, 0)
        # if self.bias is not None:
        #     bias = torch.matmul(routing_weights, self.bias)
        #     bias = torch.split(bias, 1, 0)
        # else:
        #     bias = [None] * B
        # out = []
        # for xi, wi, bi in zip(x, weight, bias):
        #     wi = wi.view(*self.weight_shape)
        #     if bi is not None:
        #         bi = bi.view(*self.bias_shape)
        #     out.append(self.conv_fn(
        #         xi, wi, bi, stride=self.stride, padding=self.padding,
        #         dilation=self.dilation, groups=self.groups))
        # out = torch.cat(out, 0)
        return out
 # helper method
 def select_conv2d(in_chs, out_chs, kernel_size, **kwargs):
    """Pick and build the conv layer matching the given arguments.

    * list ``kernel_size``            -> `MixedConv2d`
    * ``num_experts`` > 0 in kwargs   -> `CondConv2d`
    * otherwise                       -> `create_conv2d_pad`

    ``groups`` must not be passed; use the ``depthwise`` bool instead, which
    sets ``groups=out_chs`` for the non-mixed layers.
    """
    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
    if isinstance(kernel_size, list):
        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
        # Only lists define MixedConv2d kernel groups; ints, tuples and other
        # iterables keep going to a normal conv and specify h, w.
        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)
    is_depthwise = kwargs.pop('depthwise', False)
    groups = out_chs if is_depthwise else 1
    use_condconv = 'num_experts' in kwargs and kwargs['num_experts'] > 0
    conv_factory = CondConv2d if use_condconv else create_conv2d_pad
    return conv_factory(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
--- a/timm/models/densenet.py
+++ b/timm/models/densenet.py
@ -10,7 +10,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 import re
--- a/timm/models/dla.py
+++ b/timm/models/dla.py
@ -13,7 +13,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
--- a/timm/models/dpn.py
+++ b/timm/models/dpn.py
@ -16,7 +16,7 @@ from collections import OrderedDict
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD
--- a/timm/models/efficientnet.py
+++ b/timm/models/efficientnet.py
@ -28,8 +28,8 @@ from .efficientnet_builder import *
 from .feature_hooks import FeatureHooks
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
-from .conv2d_layers import select_conv2d
+from timm.models.layers import create_conv2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
@ -194,7 +194,7 @@ default_cfgs = {
        input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936),
    'tf_efficientnet_l2_ns': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth',
-        input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961),
+        input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96),
    'tf_efficientnet_es': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth',
        mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5),
@ -253,7 +253,7 @@ class EfficientNet(nn.Module):
    def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32,
                 channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(EfficientNet, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -265,21 +265,21 @@ class EfficientNet(nn.Module):
        # Stem
        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
-            channel_multiplier, channel_divisor, channel_min, 32, pad_type, act_layer, se_kwargs,
+            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
        self._in_chs = builder.in_chs
        # Head + Pooling
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
+        self.conv_head = create_conv2d(self._in_chs, self.num_features, 1, padding=pad_type)
        self.bn2 = norm_layer(self.num_features, **norm_kwargs)
        self.act2 = act_layer(inplace=True)
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
@ -333,7 +333,7 @@ class EfficientNetFeatures(nn.Module):
    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
-                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 output_stride=32, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(EfficientNetFeatures, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -347,7 +347,7 @@ class EfficientNetFeatures(nn.Module):
        # Stem
        stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -355,7 +355,7 @@ class EfficientNetFeatures(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
        self._in_chs = builder.in_chs
@ -875,7 +875,7 @@ def spnasnet_100(pretrained=False, **kwargs):
@register_model
 def efficientnet_b0(pretrained=False, **kwargs):
    """ EfficientNet-B0 """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
    return model
@ -884,7 +884,7 @@ def efficientnet_b0(pretrained=False, **kwargs):
@register_model
 def efficientnet_b1(pretrained=False, **kwargs):
    """ EfficientNet-B1 """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs)
    return model
@ -893,7 +893,7 @@ def efficientnet_b1(pretrained=False, **kwargs):
@register_model
 def efficientnet_b2(pretrained=False, **kwargs):
    """ EfficientNet-B2 """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
    return model
@ -902,7 +902,7 @@ def efficientnet_b2(pretrained=False, **kwargs):
@register_model
 def efficientnet_b2a(pretrained=False, **kwargs):
    """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop"""
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b2a', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs)
    return model
@ -911,7 +911,7 @@ def efficientnet_b2a(pretrained=False, **kwargs):
@register_model
 def efficientnet_b3(pretrained=False, **kwargs):
    """ EfficientNet-B3 """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
    return model
@ -920,7 +920,7 @@ def efficientnet_b3(pretrained=False, **kwargs):
@register_model
 def efficientnet_b3a(pretrained=False, **kwargs):
    """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """
-    # NOTE for train, drop_rate should be 0.3, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b3a', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs)
    return model
@ -929,7 +929,7 @@ def efficientnet_b3a(pretrained=False, **kwargs):
@register_model
 def efficientnet_b4(pretrained=False, **kwargs):
    """ EfficientNet-B4 """
-    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs)
    return model
@ -938,7 +938,7 @@ def efficientnet_b4(pretrained=False, **kwargs):
@register_model
 def efficientnet_b5(pretrained=False, **kwargs):
    """ EfficientNet-B5 """
-    # NOTE for train, drop_rate should be 0.4, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs)
    return model
@ -947,7 +947,7 @@ def efficientnet_b5(pretrained=False, **kwargs):
@register_model
 def efficientnet_b6(pretrained=False, **kwargs):
    """ EfficientNet-B6 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs)
    return model
@ -956,7 +956,7 @@ def efficientnet_b6(pretrained=False, **kwargs):
@register_model
 def efficientnet_b7(pretrained=False, **kwargs):
    """ EfficientNet-B7 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs)
    return model
@ -965,7 +965,7 @@ def efficientnet_b7(pretrained=False, **kwargs):
@register_model
 def efficientnet_b8(pretrained=False, **kwargs):
    """ EfficientNet-B8 """
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs)
    return model
@ -974,7 +974,7 @@ def efficientnet_b8(pretrained=False, **kwargs):
@register_model
 def efficientnet_l2(pretrained=False, **kwargs):
    """ EfficientNet-L2."""
-    # NOTE for train, drop_rate should be 0.5, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2
    model = _gen_efficientnet(
        'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs)
    return model
@ -1007,7 +1007,7 @@ def efficientnet_el(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 4 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs)
    return model
@ -1016,7 +1016,7 @@ def efficientnet_cc_b0_4e(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2,
        pretrained=pretrained, **kwargs)
@ -1025,7 +1025,7 @@ def efficientnet_cc_b0_8e(pretrained=False, **kwargs):
@register_model
 def efficientnet_cc_b1_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B1 w/ 8 Experts """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    model = _gen_efficientnet_condconv(
        'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2,
        pretrained=pretrained, **kwargs)
@ -1355,7 +1355,7 @@ def tf_efficientnet_el(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
@ -1366,7 +1366,7 @@ def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
@ -1377,7 +1377,7 @@ def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs):
@register_model
 def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs):
    """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """
-    # NOTE for train, drop_rate should be 0.2, drop_connect_rate should be 0.2
+    # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2
    kwargs['bn_eps'] = BN_EPS_TF_DEFAULT
    kwargs['pad_type'] = 'same'
    model = _gen_efficientnet_condconv(
--- a/timm/models/efficientnet_blocks.py
+++ b/timm/models/efficientnet_blocks.py
@ -1,11 +1,8 @@
 from functools import partial
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
+from torch.nn import functional as F
-from .activations import sigmoid
+from .layers.activations import sigmoid
-from .conv2d_layers import *
+from .layers import create_conv2d, drop_path
 # Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per
@ -72,19 +69,6 @@ def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None):
    return make_divisible(channels, divisor, channel_min)
 def drop_connect(inputs, training=False, drop_connect_rate=0.):
    """Apply drop connect."""
    if not training:
        return inputs
    keep_prob = 1 - drop_connect_rate
    random_tensor = keep_prob + torch.rand(
        (inputs.size()[0], 1, 1, 1), dtype=inputs.dtype, device=inputs.device)
    random_tensor.floor_()  # binarize
    output = inputs.div(keep_prob) * random_tensor
    return output
 class ChannelShuffle(nn.Module):
    # FIXME haven't used yet
    def __init__(self, groups):
@ -132,7 +116,7 @@ class ConvBnAct(nn.Module):
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(ConvBnAct, self).__init__()
        norm_kwargs = norm_kwargs or {}
-        self.conv = select_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
+        self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type)
        self.bn1 = norm_layer(out_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
@ -157,25 +141,27 @@ class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_chs, out_chs, dw_kernel_size=3,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0.):
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0.):
        super(DepthwiseSeparableConv, self).__init__()
        norm_kwargs = norm_kwargs or {}
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip
        self.has_pw_act = pw_act  # activation after point-wise conv
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate
-        self.conv_dw = select_conv2d(
+        self.conv_dw = create_conv2d(
            in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True)
        self.bn1 = norm_layer(in_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs)
        else:
            self.se = None
-        self.conv_pw = select_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
+        self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type)
        self.bn2 = norm_layer(out_chs, **norm_kwargs)
        self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity()
@ -193,7 +179,7 @@ class DepthwiseSeparableConv(nn.Module):
        x = self.bn1(x)
        x = self.act1(x)
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)
        x = self.conv_pw(x)
@ -201,8 +187,8 @@ class DepthwiseSeparableConv(nn.Module):
        x = self.act2(x)
        if self.has_residual:
-            if self.drop_connect_rate > 0.:
+            if self.drop_path_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x
@ -214,34 +200,36 @@ class InvertedResidual(nn.Module):
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 conv_kwargs=None, drop_connect_rate=0.):
+                 conv_kwargs=None, drop_path_rate=0.):
        super(InvertedResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
        conv_kwargs = conv_kwargs or {}
        mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate
        # Point-wise expansion
-        self.conv_pw = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
+        self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs)
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        # Depth-wise convolution
-        self.conv_dw = select_conv2d(
+        self.conv_dw = create_conv2d(
            mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation,
            padding=pad_type, depthwise=True, **conv_kwargs)
        self.bn2 = norm_layer(mid_chs, **norm_kwargs)
        self.act2 = act_layer(inplace=True)
        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
        else:
            self.se = None
        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
+        self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs)
        self.bn3 = norm_layer(out_chs, **norm_kwargs)
    def feature_module(self, location):
@ -269,7 +257,7 @@ class InvertedResidual(nn.Module):
        x = self.act2(x)
        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)
        # Point-wise linear projection
@ -277,8 +265,8 @@ class InvertedResidual(nn.Module):
        x = self.bn3(x)
        if self.has_residual:
-            if self.drop_connect_rate > 0.:
+            if self.drop_path_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x
@ -291,7 +279,7 @@ class CondConvResidual(InvertedResidual):
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False,
                 exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 num_experts=0, drop_connect_rate=0.):
+                 num_experts=0, drop_path_rate=0.):
        self.num_experts = num_experts
        conv_kwargs = dict(num_experts=self.num_experts)
@ -301,7 +289,7 @@ class CondConvResidual(InvertedResidual):
            act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size,
            pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs,
            norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs,
-            drop_connect_rate=drop_connect_rate)
+            drop_path_rate=drop_path_rate)
        self.routing_fn = nn.Linear(in_chs, self.num_experts)
@ -323,7 +311,7 @@ class CondConvResidual(InvertedResidual):
        x = self.act2(x)
        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)
        # Point-wise linear projection
@ -331,8 +319,8 @@ class CondConvResidual(InvertedResidual):
        x = self.bn3(x)
        if self.has_residual:
-            if self.drop_connect_rate > 0.:
+            if self.drop_path_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x
@ -343,29 +331,31 @@ class EdgeResidual(nn.Module):
    def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0,
                 stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1,
                 se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None,
-                 drop_connect_rate=0.):
+                 drop_path_rate=0.):
        super(EdgeResidual, self).__init__()
        norm_kwargs = norm_kwargs or {}
        if fake_in_chs > 0:
            mid_chs = make_divisible(fake_in_chs * exp_ratio)
        else:
            mid_chs = make_divisible(in_chs * exp_ratio)
-        self.has_se = se_ratio is not None and se_ratio > 0.
+        has_se = se_ratio is not None and se_ratio > 0.
        self.has_residual = (in_chs == out_chs and stride == 1) and not noskip
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate
        # Expansion convolution
-        self.conv_exp = select_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
+        self.conv_exp = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type)
        self.bn1 = norm_layer(mid_chs, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        # Squeeze-and-excitation
-        if self.has_se:
+        if has_se:
            se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer)
            self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs)
        else:
            self.se = None
        # Point-wise linear projection
-        self.conv_pwl = select_conv2d(
+        self.conv_pwl = create_conv2d(
            mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type)
        self.bn2 = norm_layer(out_chs, **norm_kwargs)
@ -389,7 +379,7 @@ class EdgeResidual(nn.Module):
        x = self.act1(x)
        # Squeeze-and-excitation
-        if self.has_se:
+        if self.se is not None:
            x = self.se(x)
        # Point-wise linear projection
@ -397,8 +387,8 @@ class EdgeResidual(nn.Module):
        x = self.bn2(x)
        if self.has_residual:
-            if self.drop_connect_rate > 0.:
+            if self.drop_path_rate > 0.:
-                x = drop_connect(x, self.training, self.drop_connect_rate)
+                x = drop_path(x, self.drop_path_rate, self.training)
            x += residual
        return x
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@ -5,7 +5,8 @@ from collections.__init__ import OrderedDict
 from copy import deepcopy
 import torch.nn as nn
-from .activations import sigmoid, HardSwish, Swish
+from .layers import CondConv2d, get_condconv_initializer
 from .layers.activations import HardSwish, Swish
 from .efficientnet_blocks import *
@ -201,7 +202,7 @@ class EfficientNetBuilder:
    """
    def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None,
                 output_stride=32, pad_type='', act_layer=None, se_kwargs=None,
-                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_connect_rate=0., feature_location='',
+                 norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0., feature_location='',
                 verbose=False):
        self.channel_multiplier = channel_multiplier
        self.channel_divisor = channel_divisor
@ -212,7 +213,7 @@ class EfficientNetBuilder:
        self.se_kwargs = se_kwargs
        self.norm_layer = norm_layer
        self.norm_kwargs = norm_kwargs
-        self.drop_connect_rate = drop_connect_rate
+        self.drop_path_rate = drop_path_rate
        self.feature_location = feature_location
        assert feature_location in ('pre_pwl', 'post_exp', '')
        self.verbose = verbose
@ -225,7 +226,7 @@ class EfficientNetBuilder:
        return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min)
    def _make_block(self, ba, block_idx, block_count):
-        drop_connect_rate = self.drop_connect_rate * block_idx / block_count
+        drop_path_rate = self.drop_path_rate * block_idx / block_count
        bt = ba.pop('block_type')
        ba['in_chs'] = self.in_chs
        ba['out_chs'] = self._round_channels(ba['out_chs'])
@ -239,7 +240,7 @@ class EfficientNetBuilder:
        ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer
        assert ba['act_layer'] is not None
        if bt == 'ir':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  InvertedResidual {}, Args: {}'.format(block_idx, str(ba)))
@ -248,13 +249,13 @@ class EfficientNetBuilder:
            else:
                block = InvertedResidual(**ba)
        elif bt == 'ds' or bt == 'dsa':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)))
            block = DepthwiseSeparableConv(**ba)
        elif bt == 'er':
-            ba['drop_connect_rate'] = drop_connect_rate
+            ba['drop_path_rate'] = drop_path_rate
            ba['se_kwargs'] = self.se_kwargs
            if self.verbose:
                logging.info('  EdgeResidual {}, Args: {}'.format(block_idx, str(ba)))
--- a/timm/models/factory.py
+++ b/timm/models/factory.py
@ -31,7 +31,21 @@ def create_model(
        kwargs.pop('bn_tf', None)
        kwargs.pop('bn_momentum', None)
        kwargs.pop('bn_eps', None)
-        kwargs.pop('drop_connect_rate', None)
+
    # Parameters that aren't supported by all models should default to None in command line args,
    # remove them if they are present and not set so that non-supporting models don't break.
    if kwargs.get('drop_block_rate', None) is None:
        kwargs.pop('drop_block_rate', None)
    # handle backwards compat with drop_connect -> drop_path change
    drop_connect_rate = kwargs.pop('drop_connect_rate', None)
    if drop_connect_rate is not None and kwargs.get('drop_path_rate', None) is None:
        print("WARNING: 'drop_connect' as an argument is deprecated, please use 'drop_path'."
              " Setting drop_path to %f." % drop_connect_rate)
        kwargs['drop_path_rate'] = drop_connect_rate
    if kwargs.get('drop_path_rate', None) is None:
        kwargs.pop('drop_path_rate', None)
    if is_model(model_name):
        create_fn = model_entrypoint(model_name)
--- a/timm/models/gluon_resnet.py
+++ b/timm/models/gluon_resnet.py
@ -11,6 +11,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
 from .layers import SEModule
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .resnet import ResNet, Bottleneck, BasicBlock
@ -319,8 +320,8 @@ def gluon_seresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kw
    """
    default_cfg = default_cfgs['gluon_seresnext50_32x4d']
    model = ResNet(
-        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4, use_se=True,
+        Bottleneck, [3, 4, 6, 3], cardinality=32, base_width=4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -333,8 +334,8 @@ def gluon_seresnext101_32x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    """
    default_cfg = default_cfgs['gluon_seresnext101_32x4d']
    model = ResNet(
-        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4, use_se=True,
+        Bottleneck, [3, 4, 23, 3], cardinality=32, base_width=4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer=SEModule), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -346,9 +347,10 @@ def gluon_seresnext101_64x4d(pretrained=False, num_classes=1000, in_chans=3, **k
    """Constructs a SEResNeXt-101-64x4d model.
    """
    default_cfg = default_cfgs['gluon_seresnext101_64x4d']
    block_args = dict(attn_layer=SEModule)
    model = ResNet(
-        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4, use_se=True,
+        Bottleneck, [3, 4, 23, 3], cardinality=64, base_width=4,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -360,10 +362,10 @@ def gluon_senet154(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs an SENet-154 model.
    """
    default_cfg = default_cfgs['gluon_senet154']
    block_args = dict(attn_layer=SEModule)
    model = ResNet(
-        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, use_se=True,
+        Bottleneck, [3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', down_kernel_size=3,
-        stem_type='deep', down_kernel_size=3, block_reduce_first=2,
+        block_reduce_first=2, num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
        num_classes=num_classes, in_chans=in_chans, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
--- a/timm/models/gluon_xception.py
+++ b/timm/models/gluon_xception.py
@ -13,7 +13,7 @@ from collections import OrderedDict
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 __all__ = ['Xception65', 'Xception71']
--- a/timm/models/hrnet.py
+++ b/timm/models/hrnet.py
@ -25,7 +25,7 @@ import torch.nn.functional as F
 from .resnet import BasicBlock, Bottleneck  # leveraging ResNet blocks w/ additional features like SE
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 _BN_MOMENTUM = 0.1
--- a/timm/models/inception_resnet_v2.py
+++ b/timm/models/inception_resnet_v2.py
@ -8,7 +8,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 __all__ = ['InceptionResnetV2']
--- a/timm/models/inception_v4.py
+++ b/timm/models/inception_v4.py
@ -8,7 +8,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
 __all__ = ['InceptionV4']
--- a/timm/models/layers/init.py
+++ b/timm/models/layers/init.py
@ -0,0 +1,17 @@
 from .padding import get_padding
 from .avg_pool2d_same import AvgPool2dSame
 from .conv2d_same import Conv2dSame
 from .conv_bn_act import ConvBnAct
 from .mixed_conv2d import MixedConv2d
 from .cond_conv2d import CondConv2d, get_condconv_initializer
 from .create_conv2d import create_conv2d
 from .create_attn import create_attn
 from .selective_kernel import SelectiveKernelConv
 from .se import SEModule
 from .eca import EcaModule, CecaModule
 from .activations import *
 from .adaptive_avgmax_pool import \
    adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d
 from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path
 from .test_time_pool import TestTimePoolHead, apply_test_time_pool
 from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model
--- a/timm/models/layers/activations.py
+++ b/timm/models/layers/activations.py
@ -1,3 +1,12 @@
 """ Activations
 A collection of activations fn and modules with a common interface so that they can
 easily be swapped. All have an `inplace` arg even if not used.
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
@ -66,20 +75,20 @@ if _USE_MEM_EFFICIENT_ISH:
        return MishJitAutoFn.apply(x)
 else:
-    def swish(x, inplace=False):
+    def swish(x, inplace: bool = False):
        """Swish - Described in: https://arxiv.org/abs/1710.05941
        """
        return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
-    def mish(x, _inplace=False):
+    def mish(x, _inplace: bool = False):
        """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681
        """
        return x.mul(F.softplus(x).tanh())
 class Swish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Swish, self).__init__()
        self.inplace = inplace
@ -88,7 +97,7 @@ class Swish(nn.Module):
 class Mish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Mish, self).__init__()
        self.inplace = inplace
@ -96,13 +105,13 @@ class Mish(nn.Module):
        return mish(x, self.inplace)
-def sigmoid(x, inplace=False):
+def sigmoid(x, inplace: bool = False):
    return x.sigmoid_() if inplace else x.sigmoid()
 # PyTorch has this, but not with a consistent inplace argument interface
 class Sigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Sigmoid, self).__init__()
        self.inplace = inplace
@ -110,13 +119,13 @@ class Sigmoid(nn.Module):
        return x.sigmoid_() if self.inplace else x.sigmoid()
-def tanh(x, inplace=False):
+def tanh(x, inplace: bool = False):
    return x.tanh_() if inplace else x.tanh()
 # PyTorch has this, but not with a consistent inplace argument interface
 class Tanh(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(Tanh, self).__init__()
        self.inplace = inplace
@ -124,13 +133,13 @@ class Tanh(nn.Module):
        return x.tanh_() if self.inplace else x.tanh()
-def hard_swish(x, inplace=False):
+def hard_swish(x, inplace: bool = False):
    inner = F.relu6(x + 3.).div_(6.)
    return x.mul_(inner) if inplace else x.mul(inner)
 class HardSwish(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(HardSwish, self).__init__()
        self.inplace = inplace
@ -138,7 +147,7 @@ class HardSwish(nn.Module):
        return hard_swish(x, self.inplace)
-def hard_sigmoid(x, inplace=False):
+def hard_sigmoid(x, inplace: bool = False):
    if inplace:
        return x.add_(3.).clamp_(0., 6.).div_(6.)
    else:
@ -146,7 +155,7 @@ def hard_sigmoid(x, inplace=False):
 class HardSigmoid(nn.Module):
-    def __init__(self, inplace=False):
+    def __init__(self, inplace: bool = False):
        super(HardSigmoid, self).__init__()
        self.inplace = inplace
--- a/timm/models/layers/adaptive_avgmax_pool.py
+++ b/timm/models/layers/adaptive_avgmax_pool.py
--- a/timm/models/layers/avg_pool2d_same.py
+++ b/timm/models/layers/avg_pool2d_same.py
@ -0,0 +1,31 @@
 """ AvgPool2d w/ Same Padding
 Hacked together by Ross Wightman
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import List
 import math
 from .helpers import tup_pair
 from .padding import pad_same
 def avg_pool2d_same(x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0),
                    ceil_mode: bool = False, count_include_pad: bool = True):
    """ Average-pool `x` after running it through `pad_same`.

    The `padding` argument is accepted but never read: the input is padded up front by
    `pad_same(x, kernel_size, stride)` and the pooling op itself always gets (0, 0).
    """
    padded = pad_same(x, kernel_size, stride)
    return F.avg_pool2d(
        padded, kernel_size, stride, padding=(0, 0),
        ceil_mode=ceil_mode, count_include_pad=count_include_pad)
 class AvgPool2dSame(nn.AvgPool2d):
    """ Tensorflow like 'SAME' wrapper for 2D average pooling

    The parent `nn.AvgPool2d` is always constructed with an explicit padding of (0, 0);
    the `padding` constructor argument is accepted for signature compatibility but ignored.
    Padding is instead applied at call time inside `avg_pool2d_same`.
    """
    def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True):
        kernel_size = tup_pair(kernel_size)
        # nn.AvgPool2d treats `stride=None` as "use kernel_size". Resolve that default here,
        # before tup_pair, so a None is never expanded into a pair of Nones (which would
        # bypass the parent's `is not None` fallback and break the padding arithmetic).
        stride = tup_pair(kernel_size if stride is None else stride)
        super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad)
    def forward(self, x):
        # self.padding is always (0, 0) here; avg_pool2d_same pads the input itself.
        return avg_pool2d_same(
            x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad)
--- a/timm/models/layers/cbam.py
+++ b/timm/models/layers/cbam.py
@ -0,0 +1,100 @@
 """ CBAM (sort-of) Attention
 Experimental impl of CBAM: Convolutional Block Attention Module: https://arxiv.org/abs/1807.06521
 WARNING: Results with these attention layers have been mixed. They can significantly reduce performance on
 some tasks, especially fine-grained it seems. I may end up removing this impl.
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from .conv_bn_act import ConvBnAct
 class ChannelAttn(nn.Module):
    """ Original CBAM channel attention module, currently avg + max pool variant only.

    Globally avg- and max-pools the input to 1x1, pushes each pooled descriptor through the
    same bias-free 1x1 conv bottleneck (fc1 -> act -> fc2), sums the two results and uses
    the sigmoid of that sum to rescale the input.
    """
    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
        super(ChannelAttn, self).__init__()
        hidden_chs = channels // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.fc1 = nn.Conv2d(channels, hidden_chs, kernel_size=1, bias=False)
        self.act = act_layer(inplace=True)
        self.fc2 = nn.Conv2d(hidden_chs, channels, kernel_size=1, bias=False)
    def forward(self, x):
        # both pooled descriptors share the fc1/fc2 weights
        avg_logits = self.fc2(self.act(self.fc1(self.avg_pool(x))))
        max_logits = self.fc2(self.act(self.fc1(self.max_pool(x))))
        gate = (avg_logits + max_logits).sigmoid()
        return x * gate
 class LightChannelAttn(ChannelAttn):
    """An experimental 'lightweight' channel attention variant that blends the avg + max pooled
    descriptors first (0.5 / 0.5) and runs the fc1 -> act -> fc2 bottleneck only once.

    Args:
        channels: number of input (and output) channels.
        reduction: divisor applied to `channels` for the bottleneck width.
        act_layer: activation class used between fc1 and fc2; forwarded to `ChannelAttn`
            so this variant can be configured the same way as its parent.
    """
    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
        super(LightChannelAttn, self).__init__(channels, reduction, act_layer=act_layer)
    def forward(self, x):
        x_pool = 0.5 * self.avg_pool(x) + 0.5 * self.max_pool(x)
        x_attn = self.fc2(self.act(self.fc1(x_pool)))
        return x * x_attn.sigmoid()
 class SpatialAttn(nn.Module):
    """ Original CBAM spatial attention module

    Reduces the input over the channel dim with both mean and max, stacks the two maps as a
    2-channel tensor, maps that to a single channel with `ConvBnAct` (no activation) and
    rescales the input by the sigmoid of the result.
    """
    def __init__(self, kernel_size=7):
        super(SpatialAttn, self).__init__()
        self.conv = ConvBnAct(2, 1, kernel_size, act_layer=None)
    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max, _ = x.max(dim=1, keepdim=True)
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        gate = self.conv(pooled).sigmoid()
        return x * gate
 class LightSpatialAttn(nn.Module):
    """An experimental 'lightweight' variant that sums avg_pool and max_pool results.

    The channel-wise mean and max maps are blended 0.5 / 0.5 into one single-channel map
    before the `ConvBnAct` (no activation), instead of being concatenated.
    """
    def __init__(self, kernel_size=7):
        super(LightSpatialAttn, self).__init__()
        self.conv = ConvBnAct(1, 1, kernel_size, act_layer=None)
    def forward(self, x):
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max, _ = x.max(dim=1, keepdim=True)
        blended = 0.5 * channel_mean + 0.5 * channel_max
        gate = self.conv(blended).sigmoid()
        return x * gate
 class CbamModule(nn.Module):
    """ CBAM block: `ChannelAttn` followed by `SpatialAttn`, applied in that order.
    """
    def __init__(self, channels, spatial_kernel_size=7):
        super(CbamModule, self).__init__()
        self.channel = ChannelAttn(channels)
        self.spatial = SpatialAttn(spatial_kernel_size)
    def forward(self, x):
        # channel gating first, then spatial gating on the already-gated tensor
        return self.spatial(self.channel(x))
 class LightCbamModule(nn.Module):
    """ Lightweight CBAM block: `LightChannelAttn` followed by `LightSpatialAttn`.
    """
    def __init__(self, channels, spatial_kernel_size=7):
        super(LightCbamModule, self).__init__()
        self.channel = LightChannelAttn(channels)
        self.spatial = LightSpatialAttn(spatial_kernel_size)
    def forward(self, x):
        # channel gating first, then spatial gating on the already-gated tensor
        return self.spatial(self.channel(x))
--- a/timm/models/layers/cond_conv2d.py
+++ b/timm/models/layers/cond_conv2d.py
@ -0,0 +1,121 @@
 """ PyTorch Conditionally Parameterized Convolution (CondConv)
 Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference
 (https://arxiv.org/abs/1904.04971)
 Hacked together by Ross Wightman
 """
 import math
 from functools import partial
 import numpy as np
 import torch
 from torch import nn as nn
 from torch.nn import functional as F
 from .helpers import tup_pair
 from .conv2d_same import get_padding_value, conv2d_same
 def get_condconv_initializer(initializer, num_experts, expert_shape):
    """ Build an initializer for a flattened, per-expert CondConv parameter.

    The returned function takes a 2D tensor of shape [num_experts, prod(expert_shape)],
    and calls `initializer` once per expert on that expert's row viewed as `expert_shape`.
    It raises ValueError if the tensor does not have that shape.
    """
    def condconv_initializer(weight):
        """CondConv initializer function."""
        num_params = np.prod(expert_shape)
        shape_ok = (
            len(weight.shape) == 2 and weight.shape[0] == num_experts and weight.shape[1] == num_params)
        if not shape_ok:
            raise ValueError('CondConv variables must have shape [num_experts, num_params]')
        for expert_idx in range(num_experts):
            # the view shares storage with `weight`, so an in-place initializer fills the row
            expert_view = weight[expert_idx].view(expert_shape)
            initializer(expert_view)
    return condconv_initializer
 class CondConv2d(nn.Module):
    """ Conditionally Parameterized Convolution
    Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py
    Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion:
    https://github.com/pytorch/pytorch/issues/17983

    Parameters are stored flattened, one row per expert: `weight` is
    (num_experts, prod(weight_shape)) and, when enabled, `bias` is (num_experts, out_channels).
    `forward` mixes the expert rows with the supplied routing weights to get one kernel per
    batch element, then runs all of them in a single grouped convolution.
    """
    __constants__ = ['bias', 'in_channels', 'out_channels', 'dynamic_padding']
    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4):
        super(CondConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = tup_pair(kernel_size)
        self.stride = tup_pair(stride)
        # get_padding_value returns the padding to store plus a flag saying whether padding
        # must instead be computed dynamically at call time (see the branch in forward).
        padding_val, is_padding_dynamic = get_padding_value(
            padding, kernel_size, stride=stride, dilation=dilation)
        self.dynamic_padding = is_padding_dynamic  # if in forward to work with torchscript
        self.padding = tup_pair(padding_val)
        self.dilation = tup_pair(dilation)
        self.groups = groups
        self.num_experts = num_experts
        # shape of ONE expert's conv kernel: (out, in // groups, kh, kw)
        self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size
        # number of elements in one expert's kernel (product of weight_shape)
        weight_num_param = 1
        for wd in self.weight_shape:
            weight_num_param *= wd
        # uninitialized storage; filled by reset_parameters() below
        self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param))
        if bias:
            self.bias_shape = (self.out_channels,)
            self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    def reset_parameters(self):
        """ Initialize each expert's weight (and bias, if present) independently.

        Weights use kaiming_uniform_ with a=sqrt(5); biases use uniform(-bound, bound) with
        bound = 1 / sqrt(fan_in), where fan_in is the product of weight_shape[1:].
        """
        init_weight = get_condconv_initializer(
            partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape)
        init_weight(self.weight)
        if self.bias is not None:
            fan_in = np.prod(self.weight_shape[1:])
            bound = 1 / math.sqrt(fan_in)
            init_bias = get_condconv_initializer(
                partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape)
            init_bias(self.bias)
    def forward(self, x, routing_weights):
        """ Convolve each batch element of `x` with its own routed mixture of expert kernels.

        `x` is unpacked as (B, C, H, W). `routing_weights` is matrix-multiplied with the
        (num_experts, num_params) weight and the result is viewed as B stacked kernels, so it
        is expected to be (B, num_experts). Returns a (B, out_channels, H_out, W_out) tensor.
        """
        B, C, H, W = x.shape
        # (B, num_experts) @ (num_experts, num_params) -> one flattened kernel per sample
        weight = torch.matmul(routing_weights, self.weight)
        new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size
        weight = weight.view(new_weight_shape)
        bias = None
        if self.bias is not None:
            bias = torch.matmul(routing_weights, self.bias)
            bias = bias.view(B * self.out_channels)
        # move batch elements with channels so each batch element can be efficiently convolved with separate kernel
        # NOTE(review): .view() needs a compatible memory layout for x -- confirm callers pass contiguous input
        x = x.view(1, B * C, H, W)
        # groups is multiplied by B so each sample's channels only see that sample's kernel
        if self.dynamic_padding:
            out = conv2d_same(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        else:
            out = F.conv2d(
                x, weight, bias, stride=self.stride, padding=self.padding,
                dilation=self.dilation, groups=self.groups * B)
        # out is (1, B * out_channels, H_out, W_out); fold the stacked channels back into a batch dim
        out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1])
        # Literal port (from TF definition)
        # x = torch.split(x, 1, 0)
        # weight = torch.split(weight, 1, 0)
        # if self.bias is not None:
        #     bias = torch.matmul(routing_weights, self.bias)
        #     bias = torch.split(bias, 1, 0)
        # else:
        #     bias = [None] * B
        # out = []
        # for xi, wi, bi in zip(x, weight, bias):
        #     wi = wi.view(*self.weight_shape)
        #     if bi is not None:
        #         bi = bi.view(*self.bias_shape)
        #     out.append(self.conv_fn(
        #         xi, wi, bi, stride=self.stride, padding=self.padding,
        #         dilation=self.dilation, groups=self.groups))
        # out = torch.cat(out, 0)
        return out
--- a/timm/models/layers/conv2d_same.py
+++ b/timm/models/layers/conv2d_same.py
@ -0,0 +1,66 @@
 """ Conv2d w/ Same Padding
 Hacked together by Ross Wightman
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from typing import Union, List, Tuple, Optional, Callable
 import math
 from .padding import get_padding, pad_same, is_static_pad
 def conv2d_same(
        x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1),
        padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1):
    """ Functional 2D convolution with TensorFlow-like 'SAME' padding.

    The input is first padded by `pad_same`, using the kernel's spatial size together with
    `stride` and `dilation`, and then convolved with explicit zero padding.
    The `padding` argument is accepted but not used.
    """
    padded = pad_same(x, weight.shape[-2:], stride, dilation)
    return F.conv2d(padded, weight, bias, stride=stride, padding=(0, 0), dilation=dilation, groups=groups)
 class Conv2dSame(nn.Conv2d):
    """ nn.Conv2d variant whose forward pass runs through `conv2d_same`
    (Tensorflow like 'SAME' padding computed from the input size).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1, bias=True):
        # the `padding` argument is ignored: the base conv is always constructed with padding 0
        super().__init__(
            in_channels, out_channels, kernel_size, stride=stride, padding=0,
            dilation=dilation, groups=groups, bias=bias)

    def forward(self, x):
        return conv2d_same(
            x, self.weight, self.bias, stride=self.stride, padding=self.padding,
            dilation=self.dilation, groups=self.groups)
 def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]:
    """ Resolve a padding spec into `(padding, dynamic)`.

    Non-string values pass through unchanged with `dynamic=False`. String values
    (case-insensitive) are resolved as follows:
      * 'valid' -> padding 0
      * 'same'  -> TF compatible 'SAME' padding: static symmetric padding when
                   `is_static_pad` allows it (no extra overhead), otherwise padding 0 with
                   `dynamic=True` (padding must be done at runtime, which has a
                   performance and GPU memory allocation impact)
      * anything else (incl. '') -> PyTorch style 'same'-ish symmetric padding
    """
    if not isinstance(padding, str):
        return padding, False

    mode = padding.lower()
    if mode == 'valid':
        return 0, False
    if mode == 'same' and not is_static_pad(kernel_size, **kwargs):
        # cannot be expressed statically, caller must pad dynamically
        return 0, True
    return get_padding(kernel_size, **kwargs), False
 def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs):
    """ Build a conv layer for the requested padding mode.

    Pops 'padding' from kwargs (default ''), defaults 'bias' to False, and returns a `Conv2dSame`
    when the padding has to be computed dynamically, otherwise a plain `nn.Conv2d` with the
    resolved static padding.
    """
    requested_padding = kwargs.pop('padding', '')
    kwargs.setdefault('bias', False)
    padding, is_dynamic = get_padding_value(requested_padding, kernel_size, **kwargs)
    if not is_dynamic:
        return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs)
    return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs)
--- a/timm/models/layers/conv_bn_act.py
+++ b/timm/models/layers/conv_bn_act.py
@ -0,0 +1,32 @@
 """ Conv2d + BN + Act
 Hacked together by Ross Wightman
 """
 from torch import nn as nn
 from timm.models.layers import get_padding
 class ConvBnAct(nn.Module):
    """ Conv2d -> norm -> (optional drop block) -> (optional activation).

    The convolution has no bias and uses PyTorch style symmetric padding from `get_padding`.
    Passing `act_layer=None` disables the activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, dilation=1, groups=1,
                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        super().__init__()
        # assuming PyTorch style padding for this block
        pad = get_padding(kernel_size, stride, dilation)
        self.conv = nn.Conv2d(
            in_channels, out_channels, kernel_size=kernel_size, stride=stride,
            padding=pad, dilation=dilation, groups=groups, bias=False)
        self.bn = norm_layer(out_channels)
        self.drop_block = drop_block
        self.act = act_layer(inplace=True) if act_layer is not None else None

    def forward(self, x):
        x = self.bn(self.conv(x))
        # both trailing stages are optional; drop block runs before the activation
        for stage in (self.drop_block, self.act):
            if stage is not None:
                x = stage(x)
        return x
--- a/timm/models/layers/create_attn.py
+++ b/timm/models/layers/create_attn.py
@ -0,0 +1,35 @@
 """ Select AttentionFactory Method
 Hacked together by Ross Wightman
 """
 import torch
 from .se import SEModule
 from .eca import EcaModule, CecaModule
 from .cbam import CbamModule, LightCbamModule
 def create_attn(attn_type, channels, **kwargs):
    """ Create an attention module for `channels` feature channels.

    Args:
        attn_type: selects the module:
            * None or False -> no attention, returns None
            * True -> SEModule
            * str (case-insensitive) -> one of 'se', 'eca', 'ceca', 'cbam', 'lcbam'
            * anything else -> used directly as the module class / factory
        channels: passed as the first positional argument to the module constructor
        **kwargs: forwarded to the module constructor

    Returns:
        The constructed module, or None when no attention was requested.

    Raises:
        AssertionError: if `attn_type` is a string that is not a known module name.
    """
    module_cls = None
    if attn_type is not None:
        if isinstance(attn_type, str):
            attn_type = attn_type.lower()
            if attn_type == 'se':
                module_cls = SEModule
            elif attn_type == 'eca':
                module_cls = EcaModule
            elif attn_type == 'ceca':
                # circular ECA; this branch previously repeated 'eca' and was unreachable
                module_cls = CecaModule
            elif attn_type == 'cbam':
                module_cls = CbamModule
            elif attn_type == 'lcbam':
                module_cls = LightCbamModule
            else:
                assert False, "Invalid attn module (%s)" % attn_type
        elif isinstance(attn_type, bool):
            if attn_type:
                module_cls = SEModule
        else:
            module_cls = attn_type
    if module_cls is not None:
        return module_cls(channels, **kwargs)
    return None
--- a/timm/models/layers/create_conv2d.py
+++ b/timm/models/layers/create_conv2d.py
@ -0,0 +1,30 @@
 """ Create Conv2d Factory Method
 Hacked together by Ross Wightman
 """
 from .mixed_conv2d import MixedConv2d
 from .cond_conv2d import CondConv2d
 from .conv2d_same import create_conv2d_pad
 def create_conv2d(in_chs, out_chs, kernel_size, **kwargs):
    """ Select a 2d convolution implementation based on arguments

    Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d.
    A `list` kernel_size selects MixedConv2d (one kernel group per entry); ints, tuples and other
    iterables go to a normal conv and specify h, w. A positive 'num_experts' kwarg selects CondConv2d.

    Used extensively by EfficientNet, MobileNetv3 and related networks.
    """
    assert 'groups' not in kwargs  # only use 'depthwise' bool arg
    if isinstance(kernel_size, list):
        assert 'num_experts' not in kwargs  # MixNet + CondConv combo not supported currently
        return MixedConv2d(in_chs, out_chs, kernel_size, **kwargs)

    groups = out_chs if kwargs.pop('depthwise', False) else 1
    use_cond_conv = 'num_experts' in kwargs and kwargs['num_experts'] > 0
    conv_factory = CondConv2d if use_cond_conv else create_conv2d_pad
    return conv_factory(in_chs, out_chs, kernel_size, groups=groups, **kwargs)
--- a/timm/models/layers/drop.py
+++ b/timm/models/layers/drop.py
@ -0,0 +1,109 @@
 """ DropBlock, DropPath
 PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers.
 Papers:
 DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890)
 Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382)
 Code:
 DropBlock impl inspired by two Tensorflow impl that I liked:
 - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74
 - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py
 Hacked together by Ross Wightman
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
 import math
 def drop_block_2d(x, drop_prob=0.1, training=False, block_size=7, gamma_scale=1.0, drop_with_noise=False):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
    runs with success, but needs further validation and possibly optimization for lower runtime impact.

    Args:
        x: input feature map, unpacked as (B, C, H, W)
        drop_prob: target fraction of activations to drop
        training: the input is returned untouched unless this is True
        block_size: side length of the dropped squares, clipped to the feature map size
        gamma_scale: multiplier applied to the seed drop rate (gamma)
        drop_with_noise: replace dropped activations with gaussian noise instead of zeroing them
            and rescaling the remaining ones

    Returns:
        Tensor with the same shape as `x`.
    """
    if drop_prob == 0. or not training:
        return x
    _, _, height, width = x.shape
    total_size = width * height
    clipped_block_size = min(block_size, min(width, height))
    # seed_drop_rate, the gamma parameter. The count of valid seed positions uses the clipped block
    # size so it stays >= 1 (no zero / negative divisor) when block_size exceeds the feature map.
    seed_drop_rate = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / (
            (width - clipped_block_size + 1) *
            (height - clipped_block_size + 1))

    # Forces the block to be inside the feature map. Row and column indices are broadcast to
    # (H, W) directly so the mask is laid out correctly for non-square inputs as well.
    lead = clipped_block_size // 2
    trail = (clipped_block_size - 1) // 2
    h_i = torch.arange(height, device=x.device).view(height, 1)
    w_i = torch.arange(width, device=x.device).view(1, width)
    valid_block = ((w_i >= lead) & (w_i < width - trail)) & ((h_i >= lead) & (h_i < height - trail))
    valid_block = torch.reshape(valid_block, (1, 1, height, width)).float()

    uniform_noise = torch.rand_like(x, dtype=torch.float32)
    # 0 marks a block seed (only possible at valid positions), 1 everything else
    block_mask = ((2 - seed_drop_rate - valid_block + uniform_noise) >= 1).float()
    # min-pool (negated max-pool) grows every seed into a full block
    block_mask = -F.max_pool2d(
        -block_mask,
        kernel_size=clipped_block_size,
        stride=1,
        padding=clipped_block_size // 2)
    if clipped_block_size % 2 == 0:
        # an even kernel with symmetric padding yields (H + 1, W + 1); dropping the first row / col
        # keeps each block aligned with the seed positions permitted by valid_block
        block_mask = block_mask[:, :, 1:, 1:]

    if drop_with_noise:
        normal_noise = torch.randn_like(x)
        x = x * block_mask + normal_noise * (1 - block_mask)
    else:
        # rescale kept activations to compensate for the dropped ones
        normalize_scale = block_mask.numel() / (torch.sum(block_mask) + 1e-7)
        x = x * block_mask * normalize_scale
    return x
 class DropBlock2d(nn.Module):
    """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf

    Module wrapper that stores the settings and applies `drop_block_2d` using the
    module's own training flag.
    """

    def __init__(self, drop_prob=0.1, block_size=7, gamma_scale=1.0, with_noise=False):
        super().__init__()
        self.drop_prob = drop_prob
        self.gamma_scale = gamma_scale
        self.block_size = block_size
        self.with_noise = with_noise

    def forward(self, x):
        return drop_block_2d(
            x,
            drop_prob=self.drop_prob,
            training=self.training,
            block_size=self.block_size,
            gamma_scale=self.gamma_scale,
            drop_with_noise=self.with_noise)
 def drop_path(x, drop_prob=0., training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.

    Args:
        x: input tensor of any rank >= 1 whose first dimension is the batch
        drop_prob: probability of zeroing a whole sample; None or 0 disables dropping
        training: the input is returned untouched unless this is True

    Returns:
        `x` itself when dropping is disabled, otherwise a tensor of the same shape in which each
        sample is either zeroed or scaled by 1 / (1 - drop_prob).
    """
    if drop_prob is None or drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    # one draw per sample, broadcast over every remaining dim (works for any input rank, not just 4D)
    mask_shape = (x.shape[0],) + (1,) * (x.dim() - 1)
    random_tensor = keep_prob + torch.rand(mask_shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output
 class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks).

    Plain nn.Module: the layer holds no sub-modules, and deriving from nn.ModuleDict made every
    instance an empty container that evaluates as False in boolean context.

    Args:
        drop_prob: probability of dropping a sample's path, handed to `drop_path` on every call
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
--- a/timm/models/layers/eca.py
+++ b/timm/models/layers/eca.py
@ -0,0 +1,124 @@
 """
 ECA module from ECAnet
 paper: ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks
 https://arxiv.org/abs/1910.03151
 Original ECA model borrowed from https://github.com/BangguWu/ECANet
 Modified circular ECA implementation and adaption for use in timm package
 by Chris Ha https://github.com/VRandme
 Original License:
 MIT License
 Copyright (c) 2019 BangguWu, Qilong Wang
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
 import math
 from torch import nn
 import torch.nn.functional as F
 class EcaModule(nn.Module):
    """Constructs an ECA (efficient channel attention) module.

    Args:
        channels: Number of channels of the input feature map. When given, the conv kernel size
            is derived from it (adaptive kernel size, see https://arxiv.org/pdf/1910.03151.pdf)
            and `kernel_size` is only checked for being odd.
            (default=None: use `kernel_size` as given.)
        kernel_size: Kernel size of the 1D conv across channels when `channels` is None (default=3)
        gamma, beta: parameters of the channels -> kernel size mapping function
    """

    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
        super().__init__()
        assert kernel_size % 2 == 1
        if channels is not None:
            # adaptive size: bump even values to the next odd number, never go below 3
            adaptive = int(abs(math.log(channels, 2) + beta) / gamma)
            if adaptive % 2 == 0:
                adaptive += 1
            kernel_size = max(adaptive, 3)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)

    def forward(self, x):
        batch = x.shape[0]
        # global spatial descriptor, laid out as a length-C sequence with a single conv channel
        pooled = self.avg_pool(x).view(batch, 1, -1)
        # local cross-channel interaction via the 1D conv, squashed into a per-channel gate
        gate = self.conv(pooled).view(batch, -1, 1, 1).sigmoid()
        return x * gate.expand_as(x)
 class CecaModule(nn.Module):
    """Constructs a circular ECA module.

    ECA module where the conv uses circular padding rather than zero padding.
    Unlike the spatial dimension, the channels do not have inherent ordering nor
    locality. Although this module in essence, applies such an assumption, it is unnecessary
    to limit the channels on either "edge" from being circularly adapted to each other.
    This will fundamentally increase connectivity and possibly increase performance metrics
    (accuracy, robustness), without significantly impacting resource metrics
    (parameter size, throughput, latency, etc)

    Args:
        channels: Number of channels of the input feature map. When given, the conv kernel size
            is derived from it (adaptive kernel size, see https://arxiv.org/pdf/1910.03151.pdf)
            and `kernel_size` is only checked for being odd.
            (default=None: use `kernel_size` as given.)
        kernel_size: Kernel size of the 1D conv across channels when `channels` is None (default=3)
        gamma, beta: parameters of the channels -> kernel size mapping function
    """

    def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1):
        super().__init__()
        assert kernel_size % 2 == 1
        if channels is not None:
            # adaptive size: bump even values to the next odd number, never go below 3
            adaptive = int(abs(math.log(channels, 2) + beta) / gamma)
            if adaptive % 2 == 0:
                adaptive += 1
            kernel_size = max(adaptive, 3)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # pytorch circular padding mode is buggy as of pytorch 1.4
        # see https://github.com/pytorch/pytorch/pull/17240
        # so the conv itself is unpadded and circular padding is applied manually in forward
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=False)
        self.padding = (kernel_size - 1) // 2

    def forward(self, x):
        batch = x.shape[0]
        # global spatial descriptor, laid out as a length-C sequence with a single conv channel
        pooled = self.avg_pool(x).view(batch, 1, -1)
        # manual circular padding, F.pad does not seem to be affected by the bug
        wrapped = F.pad(pooled, (self.padding, self.padding), mode='circular')
        # cross-channel interaction via the 1D conv, squashed into a per-channel gate
        gate = self.conv(wrapped).view(batch, -1, 1, 1).sigmoid()
        return x * gate.expand_as(x)
--- a/timm/models/layers/helpers.py
+++ b/timm/models/layers/helpers.py
@ -0,0 +1,27 @@
 """ Layer/Module Helpers
 Hacked together by Ross Wightman
 """
 from itertools import repeat
 from torch._six import container_abcs
 # From PyTorch internals
 def _ntuple(n):
    def parse(x):
        if isinstance(x, container_abcs.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse
 tup_single = _ntuple(1)
 tup_pair = _ntuple(2)
 tup_triple = _ntuple(3)
 tup_quadruple = _ntuple(4)
--- a/timm/models/layers/median_pool.py
+++ b/timm/models/layers/median_pool.py
--- a/timm/models/layers/mixed_conv2d.py
+++ b/timm/models/layers/mixed_conv2d.py
@ -0,0 +1,51 @@
 """ PyTorch Mixed Convolution
 Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595)
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from .conv2d_same import create_conv2d_pad
 def _split_channels(num_chan, num_groups):
    split = [num_chan // num_groups for _ in range(num_groups)]
    split[0] += num_chan - sum(split)
    return split
 class MixedConv2d(nn.ModuleDict):
    """ Mixed Grouped Convolution

    Input channels are split into one group per kernel size, each group is convolved by its own
    conv layer (stored under the keys '0', '1', ...) and the results are concatenated on dim 1.

    Based on MDConv and GroupedConv in MixNet impl:
      https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py
    """

    def __init__(self, in_channels, out_channels, kernel_size=3,
                 stride=1, padding='', dilation=1, depthwise=False, **kwargs):
        super().__init__()
        kernels = kernel_size if isinstance(kernel_size, list) else [kernel_size]
        group_count = len(kernels)
        in_splits = _split_channels(in_channels, group_count)
        out_splits = _split_channels(out_channels, group_count)
        self.in_channels = sum(in_splits)
        self.out_channels = sum(out_splits)
        for idx, k in enumerate(kernels):
            in_ch, out_ch = in_splits[idx], out_splits[idx]
            branch = create_conv2d_pad(
                in_ch, out_ch, k, stride=stride, padding=padding, dilation=dilation,
                groups=out_ch if depthwise else 1, **kwargs)
            # use add_module to keep key space clean
            self.add_module(str(idx), branch)
        self.splits = in_splits

    def forward(self, x):
        chunks = torch.split(x, self.splits, 1)
        outputs = []
        for idx, conv in enumerate(self.values()):
            outputs.append(conv(chunks[idx]))
        return torch.cat(outputs, 1)
--- a/timm/models/layers/padding.py
+++ b/timm/models/layers/padding.py
@ -0,0 +1,33 @@
 """ Padding Helpers
 Hacked together by Ross Wightman
 """
 import math
 from typing import List
 import torch.nn.functional as F
 # Calculate symmetric padding for a convolution
 def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int:
    """ Symmetric padding for a convolution; extra keyword arguments are ignored. """
    dilated_extent = dilation * (kernel_size - 1)
    return (dilated_extent + stride - 1) // 2
 # Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution
 def get_same_padding(x: int, k: int, s: int, d: int):
    """ Total TF 'SAME' padding along one dim for input size x, kernel k, stride s, dilation d. """
    out_size = math.ceil(x / s)
    covered = (out_size - 1) * s + (k - 1) * d + 1  # input extent needed to produce out_size outputs
    return max(covered - x, 0)
 # Can SAME padding for given args be done statically?
 def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_):
    """ True when stride is 1 and the dilated kernel extent is even; extra keyword arguments are ignored. """
    if stride != 1:
        return False
    return (dilation * (kernel_size - 1)) % 2 == 0
 # Dynamically pad input x with 'SAME' padding for conv with specified args
 def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1)):
    """ Pad the last two dims of x with 'SAME' padding for kernel k, stride s, dilation d.

    When the total padding of a dim is odd, the extra element goes to the bottom / right side.
    The input is returned as-is when no padding is required.
    """
    ih, iw = x.size()[-2:]
    pad_h = get_same_padding(ih, k[0], s[0], d[0])
    pad_w = get_same_padding(iw, k[1], s[1], d[1])
    if pad_h <= 0 and pad_w <= 0:
        return x
    left, top = pad_w // 2, pad_h // 2
    return F.pad(x, [left, pad_w - left, top, pad_h - top])
--- a/timm/models/layers/se.py
+++ b/timm/models/layers/se.py
@ -0,0 +1,21 @@
 from torch import nn as nn
 class SEModule(nn.Module):
    """ Squeeze-and-excitation style channel attention.

    Global average pool -> 1x1 conv reducing to max(channels // reduction, 8) channels ->
    activation -> 1x1 conv back to `channels` -> sigmoid gate multiplied onto the input.
    """

    def __init__(self, channels, reduction=16, act_layer=nn.ReLU):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        squeezed = max(channels // reduction, 8)  # never squeeze below 8 channels
        self.fc1 = nn.Conv2d(channels, squeezed, kernel_size=1, padding=0, bias=True)
        self.act = act_layer(inplace=True)
        self.fc2 = nn.Conv2d(squeezed, channels, kernel_size=1, padding=0, bias=True)

    def forward(self, x):
        gate = self.fc2(self.act(self.fc1(self.avg_pool(x))))
        return x * gate.sigmoid()
--- a/timm/models/layers/selective_kernel.py
+++ b/timm/models/layers/selective_kernel.py
@ -0,0 +1,120 @@
 """ Selective Kernel Convolution/Attention
 Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
 Hacked together by Ross Wightman
 """
 import torch
 from torch import nn as nn
 from .conv_bn_act import ConvBnAct
 def _kernel_valid(k):
    if isinstance(k, (list, tuple)):
        for ki in k:
            return _kernel_valid(ki)
    assert k >= 3 and k % 2
 class SelectiveKernelAttn(nn.Module):
    """ Selective Kernel Attention Module

    Selective Kernel attention mechanism factored out into its own module.

    Takes the stacked path outputs (paths on dim 1) and returns softmax weights over the paths
    with shape (B, num_paths, channels, 1, 1).
    """

    def __init__(self, channels, num_paths=2, attn_channels=32,
                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        super().__init__()
        self.num_paths = num_paths
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False)
        self.bn = norm_layer(attn_channels)
        self.act = act_layer(inplace=True)
        self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False)

    def forward(self, x):
        assert x.shape[1] == self.num_paths
        # fuse the paths, then squeeze to a compact global descriptor
        fused = torch.sum(x, dim=1)
        descriptor = self.act(self.bn(self.fc_reduce(self.pool(fused))))
        # one logit per (path, channel); softmax makes the paths compete for every channel
        logits = self.fc_select(descriptor)
        B, C, H, W = logits.shape
        logits = logits.view(B, self.num_paths, C // self.num_paths, H, W)
        return torch.softmax(logits, dim=1)
 class SelectiveKernelConv(nn.Module):
    """ Selective Kernel Convolution Module

    As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications.

    Largest change is the input split, which divides the input channels across each convolution path, this can
    be viewed as a grouping of sorts, but the output channel counts expand to the module level value. This keeps
    the parameter count from ballooning when the convolutions themselves don't have groups, but still provides
    a noteworthy increase in performance over similar param count models without this attention layer. -Ross W

    Args:
        in_channels (int):  module input (feature) channel count
        out_channels (int):  module output (feature) channel count
        kernel_size (int, list): kernel size for each convolution branch, defaults to [3, 5];
            a non-list value is used for two identical branches
        stride (int): stride for convolutions
        dilation (int): dilation for module as a whole, impacts dilation of each branch
        groups (int): number of groups for each branch, capped at out_channels
        attn_reduction (int, float): reduction factor for attention features
        min_attn_channels (int): minimum attention feature channels
        keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations
        split_input (bool): split input channels evenly across each convolution branch, keeps param count lower,
            can be viewed as grouping by path, output expands to module out_channels count
        drop_block (nn.Module): drop block module
        act_layer (nn.Module): activation layer to use
        norm_layer (nn.Module): batchnorm/norm layer to use
    """

    def __init__(self, in_channels, out_channels, kernel_size=None, stride=1, dilation=1, groups=1,
                 attn_reduction=16, min_attn_channels=32, keep_3x3=True, split_input=False,
                 drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
        super().__init__()
        # default to one 3x3 and one 5x5 branch. 5x5 -> 3x3 + dilation
        kernel_size = kernel_size or [3, 5]
        _kernel_valid(kernel_size)
        if not isinstance(kernel_size, list):
            kernel_size = [kernel_size] * 2
        path_count = len(kernel_size)
        if keep_3x3:
            # emulate the larger kernels with dilated 3x3 convs
            dilation = [dilation * (k - 1) // 2 for k in kernel_size]
            kernel_size = [3] * path_count
        else:
            dilation = [dilation] * path_count

        self.num_paths = path_count
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.split_input = split_input
        if self.split_input:
            assert in_channels % self.num_paths == 0
            in_channels = in_channels // self.num_paths
        groups = min(out_channels, groups)

        shared_kwargs = dict(
            stride=stride, groups=groups, drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer)
        branches = []
        for k, d in zip(kernel_size, dilation):
            branches.append(ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **shared_kwargs))
        self.paths = nn.ModuleList(branches)

        attn_channels = max(int(out_channels / attn_reduction), min_attn_channels)
        self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels)
        self.drop_block = drop_block

    def forward(self, x):
        if self.split_input:
            # every path only sees its own slice of the input channels
            chunks = torch.split(x, self.in_channels // self.num_paths, 1)
            path_outs = [path(chunks[i]) for i, path in enumerate(self.paths)]
        else:
            path_outs = [path(x) for path in self.paths]
        stacked = torch.stack(path_outs, dim=1)
        # weight every path by its attention and merge the paths
        return torch.sum(stacked * self.attn(stacked), dim=1)
--- a/timm/models/layers/split_batchnorm.py
+++ b/timm/models/layers/split_batchnorm.py
--- a/timm/models/layers/test_time_pool.py
+++ b/timm/models/layers/test_time_pool.py
@ -1,3 +1,8 @@
 """ Test Time Pooling (Average-Max Pool)
 Hacked together by Ross Wightman
 """
 import logging
 from torch import nn
 import torch.nn.functional as F
@ -29,6 +34,8 @@ class TestTimePoolHead(nn.Module):
 def apply_test_time_pool(model, config, args):
    test_time_pool = False
    if not hasattr(model, 'default_cfg') or not model.default_cfg:
        return model, False
    if not args.no_test_pool and \
            config['input_size'][-1] > model.default_cfg['input_size'][-1] and \
            config['input_size'][-2] > model.default_cfg['input_size'][-2]:
--- a/timm/models/mobilenetv3.py
+++ b/timm/models/mobilenetv3.py
@ -7,15 +7,12 @@ Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244
 Hacked together by Ross Wightman
 """
 import torch.nn as nn
 import torch.nn.functional as F
 from .efficientnet_builder import *
 from .activations import HardSwish, hard_sigmoid
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d, create_conv2d
-from .conv2d_layers import select_conv2d
+from .layers.activations import HardSwish, hard_sigmoid
 from .feature_hooks import FeatureHooks
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
@ -74,7 +71,7 @@ class MobileNetV3(nn.Module):
    """
    def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True,
-                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0.,
+                 channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0.,
                 se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'):
        super(MobileNetV3, self).__init__()
@ -85,7 +82,7 @@ class MobileNetV3(nn.Module):
        # Stem
        stem_size = round_channels(stem_size, channel_multiplier)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -93,14 +90,14 @@ class MobileNetV3(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features
        self._in_chs = builder.in_chs
        # Head + Pooling
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
-        self.conv_head = select_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
+        self.conv_head = create_conv2d(self._in_chs, self.num_features, 1, padding=pad_type, bias=head_bias)
        self.act2 = act_layer(inplace=True)
        # Classifier
@ -151,7 +148,7 @@ class MobileNetV3Features(nn.Module):
    def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='pre_pwl',
                 in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='',
-                 act_layer=nn.ReLU, drop_rate=0., drop_connect_rate=0., se_kwargs=None,
+                 act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., se_kwargs=None,
                 norm_layer=nn.BatchNorm2d, norm_kwargs=None):
        super(MobileNetV3Features, self).__init__()
        norm_kwargs = norm_kwargs or {}
@ -165,7 +162,7 @@ class MobileNetV3Features(nn.Module):
        # Stem
        stem_size = round_channels(stem_size, channel_multiplier)
-        self.conv_stem = select_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
+        self.conv_stem = create_conv2d(self._in_chs, stem_size, 3, stride=2, padding=pad_type)
        self.bn1 = norm_layer(stem_size, **norm_kwargs)
        self.act1 = act_layer(inplace=True)
        self._in_chs = stem_size
@ -173,7 +170,7 @@ class MobileNetV3Features(nn.Module):
        # Middle stages (IR/ER/DS Blocks)
        builder = EfficientNetBuilder(
            channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs,
-            norm_layer, norm_kwargs, drop_connect_rate, feature_location=feature_location, verbose=_DEBUG)
+            norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG)
        self.blocks = nn.Sequential(*builder(self._in_chs, block_args))
        self.feature_info = builder.features  # builder provides info about feature channels for each block
        self._in_chs = builder.in_chs
--- a/timm/models/nasnet.py
+++ b/timm/models/nasnet.py
@ -4,7 +4,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 __all__ = ['NASNetALarge']
--- a/timm/models/pnasnet.py
+++ b/timm/models/pnasnet.py
@ -14,7 +14,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 __all__ = ['PNASNet5Large']
--- a/timm/models/res2net.py
+++ b/timm/models/res2net.py
@ -8,10 +8,10 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from .resnet import ResNet, SEModule
+from .resnet import ResNet
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SEModule
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 __all__ = []
@ -53,15 +53,16 @@ class Bottle2neck(nn.Module):
    expansion = 4
    def __init__(self, inplanes, planes, stride=1, downsample=None,
-                 cardinality=1, base_width=26, scale=4, use_se=False,
+                 cardinality=1, base_width=26, scale=4, dilation=1, first_dilation=None,
-                 act_layer=nn.ReLU, norm_layer=None, dilation=1, previous_dilation=1, **_):
+                 act_layer=nn.ReLU, norm_layer=None, attn_layer=None, **_):
        super(Bottle2neck, self).__init__()
        self.scale = scale
        self.is_first = stride > 1 or downsample is not None
        self.num_scales = max(1, scale - 1)
        width = int(math.floor(planes * (base_width / 64.0))) * cardinality
        outplanes = planes * self.expansion
        self.width = width
        outplanes = planes * self.expansion
        first_dilation = first_dilation or dilation
        self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False)
        self.bn1 = norm_layer(width * scale)
@ -70,8 +71,8 @@ class Bottle2neck(nn.Module):
        bns = []
        for i in range(self.num_scales):
            convs.append(nn.Conv2d(
-                width, width, kernel_size=3, stride=stride, padding=dilation,
+                width, width, kernel_size=3, stride=stride, padding=first_dilation,
-                dilation=dilation, groups=cardinality, bias=False))
+                dilation=first_dilation, groups=cardinality, bias=False))
            bns.append(norm_layer(width))
        self.convs = nn.ModuleList(convs)
        self.bns = nn.ModuleList(bns)
@ -81,11 +82,14 @@ class Bottle2neck(nn.Module):
        self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+        self.se = attn_layer(outplanes) if attn_layer is not None else None
        self.relu = act_layer(inplace=True)
        self.downsample = downsample
    def zero_init_last_bn(self):
        nn.init.zeros_(self.bn3.weight)
    def forward(self, x):
        residual = x
--- a/timm/models/resnet.py
+++ b/timm/models/resnet.py
@ -7,13 +7,12 @@ ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered ste
 """
 import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d, DropBlock2d, DropPath, AvgPool2dSame, create_attn
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
@ -100,136 +99,182 @@ default_cfgs = {
    'seresnext26tn_32x4d': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26tn_32x4d-569cb627.pth',
        interpolation='bicubic'),
    'ecaresnext26tn_32x4d': _cfg(
        url='',
        interpolation='bicubic'),
    'ecaresnet18': _cfg(),
    'ecaresnet50': _cfg(),
 }
-def _get_padding(kernel_size, stride, dilation=1):
+def get_padding(kernel_size, stride, dilation=1):
    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
    return padding
 class SEModule(nn.Module):
    """Squeeze-and-Excitation style channel gate.

    The input is reduced by an adaptive average pool with output size 1, passed through a
    1x1 conv -> ReLU -> 1x1 conv bottleneck, and the sigmoid of that result is multiplied
    with the original input.

    Args:
        channels: number of channels of the input (and of the produced gate).
        reduction_channels: number of channels between the two 1x1 convs.
    """

    def __init__(self, channels, reduction_channels):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # fc1 is created before fc2 so module registration and init order stay as they were.
        self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, padding=0, bias=True)
        self.relu = nn.ReLU(inplace=True)
        self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, padding=0, bias=True)

    def forward(self, x):
        """Return ``x`` scaled by the sigmoid of the pooled, bottlenecked descriptor."""
        gate = self.fc2(self.relu(self.fc1(self.avg_pool(x))))
        return x * gate.sigmoid()
 class BasicBlock(nn.Module):
    __constants__ = ['se', 'downsample']  # for pre 1.4 torchscript compat
    expansion = 1
-    def __init__(self, inplanes, planes, stride=1, downsample=None,
+    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
-                 cardinality=1, base_width=64, use_se=False,
+                 reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
-                 reduce_first=1, dilation=1, previous_dilation=1, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+                 attn_layer=None, drop_block=None, drop_path=None):
        super(BasicBlock, self).__init__()
        assert cardinality == 1, 'BasicBlock only supports cardinality of 1'
        assert base_width == 64, 'BasicBlock doest not support changing base width'
        first_planes = planes // reduce_first
        outplanes = planes * self.expansion
        first_dilation = first_dilation or dilation
        self.conv1 = nn.Conv2d(
-            inplanes, first_planes, kernel_size=3, stride=stride, padding=dilation,
+            inplanes, first_planes, kernel_size=3, stride=stride, padding=first_dilation,
-            dilation=dilation, bias=False)
+            dilation=first_dilation, bias=False)
        self.bn1 = norm_layer(first_planes)
        self.act1 = act_layer(inplace=True)
        self.conv2 = nn.Conv2d(
-            first_planes, outplanes, kernel_size=3, padding=previous_dilation,
+            first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False)
            dilation=previous_dilation, bias=False)
        self.bn2 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+
        self.se = create_attn(attn_layer, outplanes)
        self.act2 = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.drop_block = drop_block
        self.drop_path = drop_path
    def zero_init_last_bn(self):
        nn.init.zeros_(self.bn2.weight)
    def forward(self, x):
        residual = x
-        out = self.conv1(x)
+        x = self.conv1(x)
-        out = self.bn1(out)
+        x = self.bn1(x)
-        out = self.act1(out)
+        if self.drop_block is not None:
-        out = self.conv2(out)
+            x = self.drop_block(x)
-        out = self.bn2(out)
+        x = self.act1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        if self.drop_block is not None:
            x = self.drop_block(x)
        if self.se is not None:
-            out = self.se(out)
+            x = self.se(x)
-        if self.downsample is not None:
+        if self.drop_path is not None:
-            residual = self.downsample(x)
+            x = self.drop_path(x)
-        out += residual
+        if self.downsample is not None:
-        out = self.act2(out)
+            residual = self.downsample(residual)
        x += residual
        x = self.act2(x)
-        return out
+        return x
 class Bottleneck(nn.Module):
    __constants__ = ['se', 'downsample']  # for pre 1.4 torchscript compat
    expansion = 4
-    def __init__(self, inplanes, planes, stride=1, downsample=None,
+    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
-                 cardinality=1, base_width=64, use_se=False,
+                 reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d,
-                 reduce_first=1, dilation=1, previous_dilation=1, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d):
+                 attn_layer=None, drop_block=None, drop_path=None):
        super(Bottleneck, self).__init__()
        width = int(math.floor(planes * (base_width / 64)) * cardinality)
        first_planes = width // reduce_first
        outplanes = planes * self.expansion
        first_dilation = first_dilation or dilation
        self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False)
        self.bn1 = norm_layer(first_planes)
        self.act1 = act_layer(inplace=True)
        self.conv2 = nn.Conv2d(
            first_planes, width, kernel_size=3, stride=stride,
-            padding=dilation, dilation=dilation, groups=cardinality, bias=False)
+            padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False)
        self.bn2 = norm_layer(width)
        self.act2 = act_layer(inplace=True)
        self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False)
        self.bn3 = norm_layer(outplanes)
-        self.se = SEModule(outplanes, planes // 4) if use_se else None
+
        self.se = create_attn(attn_layer, outplanes)
        self.act3 = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.drop_block = drop_block
        self.drop_path = drop_path
    def zero_init_last_bn(self):
        nn.init.zeros_(self.bn3.weight)
    def forward(self, x):
        residual = x
-        out = self.conv1(x)
+        x = self.conv1(x)
-        out = self.bn1(out)
+        x = self.bn1(x)
-        out = self.act1(out)
+        if self.drop_block is not None:
            x = self.drop_block(x)
        x = self.act1(x)
-        out = self.conv2(out)
+        x = self.conv2(x)
-        out = self.bn2(out)
+        x = self.bn2(x)
-        out = self.act2(out)
+        if self.drop_block is not None:
            x = self.drop_block(x)
        x = self.act2(x)
-        out = self.conv3(out)
+        x = self.conv3(x)
-        out = self.bn3(out)
+        x = self.bn3(x)
        if self.drop_block is not None:
            x = self.drop_block(x)
        if self.se is not None:
-            out = self.se(out)
+            x = self.se(x)
        if self.drop_path is not None:
            x = self.drop_path(x)
        if self.downsample is not None:
-            residual = self.downsample(x)
+            residual = self.downsample(residual)
        x += residual
        x = self.act3(x)
-        out += residual
+        return x
        out = self.act3(out)
-        return out
+
 def downsample_conv(
        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
    """Build a conv + norm shortcut path as an ``nn.Sequential``.

    Args:
        in_channels: channels entering the shortcut.
        out_channels: channels produced by the shortcut.
        kernel_size: requested conv kernel size; forced to 1 when both stride and dilation are 1.
        stride: conv stride.
        dilation: fallback conv dilation used when ``first_dilation`` is falsy.
        first_dilation: conv dilation; only honoured when the effective kernel size is > 1.
        norm_layer: norm class called with ``out_channels``; ``nn.BatchNorm2d`` when falsy.

    Returns:
        ``nn.Sequential`` holding the bias-free conv followed by the norm layer.
    """
    if not norm_layer:
        norm_layer = nn.BatchNorm2d

    # Nothing to downsample or dilate: a 1x1 conv is enough.
    if stride == 1 and dilation == 1:
        kernel_size = 1

    # Dilation only applies to kernels larger than 1x1.
    if kernel_size > 1:
        conv_dilation = first_dilation or dilation
    else:
        conv_dilation = 1

    padding = get_padding(kernel_size, stride, conv_dilation)
    conv = nn.Conv2d(
        in_channels, out_channels, kernel_size,
        stride=stride, padding=padding, dilation=conv_dilation, bias=False)
    return nn.Sequential(conv, norm_layer(out_channels))
 def downsample_avg(
        in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None):
    """Build a pool + 1x1 conv + norm shortcut path as an ``nn.Sequential``.

    ``kernel_size`` and ``first_dilation`` are accepted so the signature matches
    ``downsample_conv``; this function does not read them.

    Args:
        in_channels: channels entering the shortcut.
        out_channels: channels produced by the shortcut.
        stride: stride used by the average pool when ``dilation`` is 1.
        dilation: when not 1 the pool stride is forced to 1.
        norm_layer: norm class called with ``out_channels``; ``nn.BatchNorm2d`` when falsy.

    Returns:
        ``nn.Sequential`` of (pool or identity, bias-free 1x1 conv, norm layer).
    """
    if not norm_layer:
        norm_layer = nn.BatchNorm2d

    if stride == 1 and dilation == 1:
        # No spatial reduction requested, so the pooling stage is a pass-through.
        pool = nn.Identity()
    else:
        avg_stride = stride if dilation == 1 else 1
        if avg_stride == 1 and dilation > 1:
            pool_cls = AvgPool2dSame
        else:
            pool_cls = nn.AvgPool2d
        pool = pool_cls(2, avg_stride, ceil_mode=True, count_include_pad=False)

    conv = nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False)
    return nn.Sequential(pool, conv, norm_layer(out_channels))
 class ResNet(nn.Module):
@ -273,8 +318,6 @@ class ResNet(nn.Module):
        Number of classification classes.
    in_chans : int, default 3
        Number of input (color) channels.
    use_se : bool, default False
        Enable Squeeze-Excitation module in blocks
    cardinality : int, default 1
        Number of convolution groups for 3x3 conv in Bottleneck.
    base_width : int, default 64
@ -303,11 +346,11 @@ class ResNet(nn.Module):
    global_pool : str, default 'avg'
        Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
    """
-    def __init__(self, block, layers, num_classes=1000, in_chans=3, use_se=False,
+    def __init__(self, block, layers, num_classes=1000, in_chans=3,
                 cardinality=1, base_width=64, stem_width=64, stem_type='',
                 block_reduce_first=1, down_kernel_size=1, avg_down=False, output_stride=32,
-                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0.0, global_pool='avg',
+                 act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0.0, drop_path_rate=0.,
-                 zero_init_last_bn=True, block_args=None):
+                 drop_block_rate=0., global_pool='avg', zero_init_last_bn=True, block_args=None):
        block_args = block_args or dict()
        self.num_classes = num_classes
        deep_stem = 'deep' in stem_type
@ -339,6 +382,9 @@ class ResNet(nn.Module):
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Feature Blocks
        dp = DropPath(drop_path_rate) if drop_path_rate else None
        db_3 = DropBlock2d(drop_block_rate, 7, 0.25) if drop_block_rate else None
        db_4 = DropBlock2d(drop_block_rate, 7, 1.00) if drop_block_rate else None
        channels, strides, dilations = [64, 128, 256, 512], [1, 2, 2, 2], [1] * 4
        if output_stride == 16:
            strides[3] = 1
@ -348,61 +394,47 @@ class ResNet(nn.Module):
            dilations[2:4] = [2, 4]
        else:
            assert output_stride == 32
-        llargs = list(zip(channels, layers, strides, dilations))
+        layer_args = list(zip(channels, layers, strides, dilations))
-        lkwargs = dict(
+        layer_kwargs = dict(
-            use_se=use_se, reduce_first=block_reduce_first, act_layer=act_layer, norm_layer=norm_layer,
+            reduce_first=block_reduce_first, act_layer=act_layer, norm_layer=norm_layer,
-            avg_down=avg_down, down_kernel_size=down_kernel_size, **block_args)
+            avg_down=avg_down, down_kernel_size=down_kernel_size, drop_path=dp, **block_args)
-        self.layer1 = self._make_layer(block, *llargs[0], **lkwargs)
+        self.layer1 = self._make_layer(block, *layer_args[0], **layer_kwargs)
-        self.layer2 = self._make_layer(block, *llargs[1], **lkwargs)
+        self.layer2 = self._make_layer(block, *layer_args[1], **layer_kwargs)
-        self.layer3 = self._make_layer(block, *llargs[2], **lkwargs)
+        self.layer3 = self._make_layer(block, drop_block=db_3, *layer_args[2], **layer_kwargs)
-        self.layer4 = self._make_layer(block, *llargs[3], **lkwargs)
+        self.layer4 = self._make_layer(block, drop_block=db_4, *layer_args[3], **layer_kwargs)
        # Head (Pooling and Classifier)
        self.global_pool = SelectAdaptivePool2d(pool_type=global_pool)
        self.num_features = 512 * block.expansion
        self.fc = nn.Linear(self.num_features * self.global_pool.feat_mult(), num_classes)
        last_bn_name = 'bn3' if 'Bottle' in block.__name__ else 'bn2'
        for n, m in self.named_modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
-                if zero_init_last_bn and 'layer' in n and last_bn_name in n:
+                nn.init.constant_(m.weight, 1.)
                    # Initialize weight/gamma of last BN in each residual block to zero
                    nn.init.constant_(m.weight, 0.)
                else:
                    nn.init.constant_(m.weight, 1.)
                nn.init.constant_(m.bias, 0.)
        if zero_init_last_bn:
            for m in self.modules():
                if hasattr(m, 'zero_init_last_bn'):
                    m.zero_init_last_bn()
    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, reduce_first=1,
-                    use_se=False, avg_down=False, down_kernel_size=1, **kwargs):
+                    avg_down=False, down_kernel_size=1, **kwargs):
        norm_layer = kwargs.get('norm_layer')
        downsample = None
-        down_kernel_size = 1 if stride == 1 and dilation == 1 else down_kernel_size
+        first_dilation = 1 if dilation in (1, 2) else 2
        if stride != 1 or self.inplanes != planes * block.expansion:
-            downsample_padding = _get_padding(down_kernel_size, stride)
+            downsample_args = dict(
-            downsample_layers = []
+                in_channels=self.inplanes, out_channels=planes * block.expansion, kernel_size=down_kernel_size,
-            conv_stride = stride
+                stride=stride, dilation=dilation, first_dilation=first_dilation, norm_layer=kwargs.get('norm_layer'))
-            if avg_down:
+            downsample = downsample_avg(**downsample_args) if avg_down else downsample_conv(**downsample_args)
                avg_stride = stride if dilation == 1 else 1
                conv_stride = 1
                downsample_layers = [nn.AvgPool2d(avg_stride, avg_stride, ceil_mode=True, count_include_pad=False)]
            downsample_layers += [
                nn.Conv2d(self.inplanes, planes * block.expansion, down_kernel_size,
                          stride=conv_stride, padding=downsample_padding, bias=False),
                norm_layer(planes * block.expansion)]
            downsample = nn.Sequential(*downsample_layers)
-        first_dilation = 1 if dilation in (1, 2) else 2
+        block_kwargs = dict(
        bkwargs = dict(
            cardinality=self.cardinality, base_width=self.base_width, reduce_first=reduce_first,
-            use_se=use_se, **kwargs)
+            dilation=dilation, **kwargs)
-        layers = [block(
+        layers = [block(self.inplanes, planes, stride, downsample, first_dilation=first_dilation, **block_kwargs)]
            self.inplanes, planes, stride, downsample, dilation=first_dilation, previous_dilation=dilation, **bkwargs)]
        self.inplanes = planes * block.expansion
-        for i in range(1, blocks):
+        layers += [block(self.inplanes, planes, **block_kwargs) for _ in range(1, blocks)]
            layers.append(block(
                self.inplanes, planes, dilation=dilation, previous_dilation=dilation, **bkwargs))
        return nn.Sequential(*layers)
@ -430,8 +462,8 @@ class ResNet(nn.Module):
    def forward(self, x):
        x = self.forward_features(x)
        x = self.global_pool(x).flatten(1)
-        if self.drop_rate > 0.:
+        if self.drop_rate:
-            x = F.dropout(x, p=self.drop_rate, training=self.training)
+            x = F.dropout(x, p=float(self.drop_rate), training=self.training)
        x = self.fc(x)
        return x
@ -903,9 +935,8 @@ def seresnext26d_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
    """
    default_cfg = default_cfgs['seresnext26d_32x4d']
    model = ResNet(
-        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
+        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, stem_type='deep', avg_down=True,
-        stem_width=32, stem_type='deep', avg_down=True, use_se=True,
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
        num_classes=num_classes, in_chans=in_chans, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -921,8 +952,8 @@ def seresnext26t_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs)
    default_cfg = default_cfgs['seresnext26t_32x4d']
    model = ResNet(
        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
-        stem_width=32, stem_type='deep_tiered', avg_down=True, use_se=True,
+        stem_width=32, stem_type='deep_tiered', avg_down=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
@ -938,8 +969,55 @@ def seresnext26tn_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs
    default_cfg = default_cfgs['seresnext26tn_32x4d']
    model = ResNet(
        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
-        stem_width=32, stem_type='deep_tiered_narrow', avg_down=True, use_se=True,
+        stem_width=32, stem_type='deep_tiered_narrow', avg_down=True,
-        num_classes=num_classes, in_chans=in_chans, **kwargs)
+        num_classes=num_classes, in_chans=in_chans, block_args=dict(attn_layer='se'), **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def ecaresnext26tn_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs an ECA-ResNeXt-26-TN model.
    This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels
    in the deep stem. The channel number of the middle stem conv is narrower than the 'T' variant.
    this model replaces SE module with the ECA module
    """
    default_cfg = default_cfgs['ecaresnext26tn_32x4d']
    block_args = dict(attn_layer='eca')
    model = ResNet(
        Bottleneck, [2, 2, 2, 2], cardinality=32, base_width=4,
        stem_width=32, stem_type='deep_tiered_narrow', avg_down=True,
        num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def ecaresnet18(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """ Constructs an ECA-ResNet-18 model.
    """
    default_cfg = default_cfgs['ecaresnet18']
    block_args = dict(attn_layer='eca')
    model = ResNet(
        BasicBlock, [2, 2, 2, 2], num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def ecaresnet50(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs an ECA-ResNet-50 model.
    """
    default_cfg = default_cfgs['ecaresnet50']
    block_args = dict(attn_layer='eca')
    model = ResNet(
        Bottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans, block_args=block_args, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
--- a/timm/models/selecsls.py
+++ b/timm/models/selecsls.py
@ -17,7 +17,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 __all__ = ['SelecSLS']  # model_registry will add each entrypoint fn to this
--- a/timm/models/senet.py
+++ b/timm/models/senet.py
@ -16,7 +16,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 __all__ = ['SENet']
--- a/timm/models/sknet.py
+++ b/timm/models/sknet.py
@ -0,0 +1,237 @@
 """ Selective Kernel Networks (ResNet base)
 Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586)
 This was inspired by reading 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268)
 and a streamlined impl at https://github.com/clovaai/assembled-cnn but I ended up building something closer
 to the original paper with some modifications of my own to better balance param count vs accuracy.
 Hacked together by Ross Wightman
 """
 import math
 from torch import nn as nn
 from .registry import register_model
 from .helpers import load_pretrained
 from .layers import SelectiveKernelConv, ConvBnAct, create_attn
 from .resnet import ResNet
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 def _cfg(url='', **kwargs):
    """Return a default config dict for one model.

    Args:
        url: value stored under the ``'url'`` key.
        **kwargs: extra entries; they override the defaults below on key collision.
    """
    cfg = {
        'url': url,
        'num_classes': 1000,
        'input_size': (3, 224, 224),
        'pool_size': (7, 7),
        'crop_pct': 0.875,
        'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN,
        'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'conv1',
        'classifier': 'fc',
    }
    cfg.update(kwargs)
    return cfg
 # Default config for each model entrypoint defined in this module, keyed by entrypoint name.
 # Entries built with a bare `_cfg()` keep the empty default url.
 default_cfgs = {
    'skresnet18': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet18_ra-4eec2804.pth'),
    'skresnet34': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet34_ra-bdc0ccde.pth'),
    'skresnet50': _cfg(),
    'skresnet50d': _cfg(),
    'skresnext50_32x4d': _cfg(
        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnext50_ra-f40e40bf.pth'),
 }
 class SelectiveKernelBasic(nn.Module):
    """Two-conv residual block whose first conv is a ``SelectiveKernelConv``.

    Layout as built below: SelectiveKernelConv -> ConvBnAct (3x3, no activation) -> optional
    attention -> optional drop path -> add shortcut -> activation.

    Args:
        inplanes: input channels.
        planes: base width; the block outputs ``planes * expansion`` channels.
        stride: stride passed to the first (selective kernel) conv.
        downsample: optional module applied to the shortcut before the add.
        cardinality: must be 1 (asserted).
        base_width: must be 64 (asserted).
        sk_kwargs: extra keyword arguments forwarded to ``SelectiveKernelConv``; falsy means none.
        reduce_first: divisor applied to ``planes`` to get the first conv's output channels.
        dilation: dilation of the second conv.
        first_dilation: dilation of the first conv; falls back to ``dilation`` when falsy.
        drop_block: forwarded to both convs as ``drop_block`` and also stored on the block.
        drop_path: optional module applied to the residual branch before the add.
        act_layer: activation class; forwarded to the first conv and used (``inplace=True``)
            for the final activation.
        norm_layer: forwarded to both convs.
        attn_layer: passed to ``create_attn`` together with the output channel count.

    Raises:
        AssertionError: if ``cardinality != 1`` or ``base_width != 64``.
    """
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64,
                 sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None,
                 drop_block=None, drop_path=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None):
        super(SelectiveKernelBasic, self).__init__()
        sk_kwargs = sk_kwargs or {}
        conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer)
        # Messages name this class (they previously named BasicBlock and contained a typo).
        assert cardinality == 1, 'SelectiveKernelBasic only supports cardinality of 1'
        assert base_width == 64, 'SelectiveKernelBasic does not support changing base width'
        first_planes = planes // reduce_first
        outplanes = planes * self.expansion
        first_dilation = first_dilation or dilation

        self.conv1 = SelectiveKernelConv(
            inplanes, first_planes, stride=stride, dilation=first_dilation, **conv_kwargs, **sk_kwargs)
        # The last conv gets no activation of its own; self.act runs after the shortcut add.
        conv_kwargs['act_layer'] = None
        self.conv2 = ConvBnAct(
            first_planes, outplanes, kernel_size=3, dilation=dilation, **conv_kwargs)
        self.se = create_attn(attn_layer, outplanes)
        self.act = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
        """Zero the weight of the norm layer inside the last conv of the residual branch."""
        nn.init.zeros_(self.conv2.bn.weight)

    def forward(self, x):
        """Run the residual branch, add the (optionally downsampled) shortcut, then activate."""
        residual = x
        x = self.conv1(x)
        x = self.conv2(x)
        if self.se is not None:
            x = self.se(x)
        if self.drop_path is not None:
            x = self.drop_path(x)
        if self.downsample is not None:
            residual = self.downsample(residual)
        x += residual
        x = self.act(x)
        return x
 class SelectiveKernelBottleneck(nn.Module):
    """Three-conv bottleneck residual block whose middle conv is a ``SelectiveKernelConv``.

    Layout as built below: ConvBnAct (1x1) -> SelectiveKernelConv -> ConvBnAct (1x1, no
    activation) -> optional attention -> optional drop path -> add shortcut -> activation.

    Args:
        inplanes: input channels.
        planes: base width; the block outputs ``planes * expansion`` channels.
        stride: stride passed to the selective kernel conv.
        downsample: optional module applied to the shortcut before the add.
        cardinality: ``groups`` of the selective kernel conv; also scales its width.
        base_width: scales the middle width as ``planes * (base_width / 64)``.
        sk_kwargs: extra keyword arguments forwarded to ``SelectiveKernelConv``; falsy means none.
        reduce_first: divisor applied to the middle width to get the first conv's output channels.
        dilation: used for the selective kernel conv when ``first_dilation`` is falsy.
        first_dilation: dilation of the selective kernel conv.
        drop_block: forwarded to all three convs as ``drop_block`` and also stored on the block.
        drop_path: optional module applied to the residual branch before the add.
        act_layer: activation class for the first two convs and the final activation.
        norm_layer: forwarded to all three convs.
        attn_layer: passed to ``create_attn`` together with the output channel count.
    """
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None,
                 cardinality=1, base_width=64, sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None,
                 drop_block=None, drop_path=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, attn_layer=None):
        super().__init__()
        if not sk_kwargs:
            sk_kwargs = {}
        act_conv_kwargs = dict(drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer)
        # Same settings with the activation disabled, for the last conv before the shortcut add.
        linear_conv_kwargs = dict(act_conv_kwargs, act_layer=None)

        width = int(math.floor(planes * (base_width / 64)) * cardinality)
        first_planes = width // reduce_first
        outplanes = planes * self.expansion
        sk_dilation = first_dilation or dilation

        self.conv1 = ConvBnAct(inplanes, first_planes, kernel_size=1, **act_conv_kwargs)
        self.conv2 = SelectiveKernelConv(
            first_planes, width, stride=stride, dilation=sk_dilation, groups=cardinality,
            **act_conv_kwargs, **sk_kwargs)
        self.conv3 = ConvBnAct(width, outplanes, kernel_size=1, **linear_conv_kwargs)
        self.se = create_attn(attn_layer, outplanes)
        self.act = act_layer(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.drop_block = drop_block
        self.drop_path = drop_path

    def zero_init_last_bn(self):
        """Zero the weight of the norm layer inside the last conv of the residual branch."""
        nn.init.zeros_(self.conv3.bn.weight)

    def forward(self, x):
        """Run the residual branch, add the (optionally downsampled) shortcut, then activate."""
        shortcut = x
        x = self.conv3(self.conv2(self.conv1(x)))
        if self.se is not None:
            x = self.se(x)
        if self.drop_path is not None:
            x = self.drop_path(x)
        if self.downsample is not None:
            shortcut = self.downsample(shortcut)
        x += shortcut
        return self.act(x)
@register_model
 def skresnet18(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs a Selective Kernel ResNet-18 model.
    Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this
    variation splits the input channels to the selective convolutions to keep param count down.
    """
    default_cfg = default_cfgs['skresnet18']
    sk_kwargs = dict(
        min_attn_channels=16,
        attn_reduction=8,
        split_input=True
    )
    model = ResNet(
        SelectiveKernelBasic, [2, 2, 2, 2], num_classes=num_classes, in_chans=in_chans,
        block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def skresnet34(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs a Selective Kernel ResNet-34 model.
    Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this
    variation splits the input channels to the selective convolutions to keep param count down.
    """
    default_cfg = default_cfgs['skresnet34']
    sk_kwargs = dict(
        min_attn_channels=16,
        attn_reduction=8,
        split_input=True
    )
    model = ResNet(
        SelectiveKernelBasic, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
        block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def skresnet50(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs a Select Kernel ResNet-50 model.
    Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this
    variation splits the input channels to the selective convolutions to keep param count down.
    """
    sk_kwargs = dict(
        split_input=True,
    )
    default_cfg = default_cfgs['skresnet50']
    model = ResNet(
        SelectiveKernelBottleneck, [3, 4, 6, 3], num_classes=num_classes, in_chans=in_chans,
        block_args=dict(sk_kwargs=sk_kwargs), zero_init_last_bn=False, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
 def skresnet50d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Constructs a Select Kernel ResNet-50-D model.
    Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this
    variation splits the input channels to the selective convolutions to keep param count down.
    """
    sk_kwargs = dict(
        split_input=True,
    )
    default_cfg = default_cfgs['skresnet50d']
    model = ResNet(
        SelectiveKernelBottleneck, [3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True,
        num_classes=num_classes, in_chans=in_chans, block_args=dict(sk_kwargs=sk_kwargs),
        zero_init_last_bn=False, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
@register_model
def skresnext50_32x4d(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """Build a Selective Kernel ResNeXt50-32x4d.

    Uses cardinality 32 with base width 4 and the default selective-kernel block
    arguments. This should be equivalent to the SKNet-50 model in the Selective
    Kernel paper.

    Args:
        pretrained: when truthy, `load_pretrained` is invoked on the new model using
            the 'skresnext50_32x4d' entry of `default_cfgs`.
        num_classes: forwarded to `ResNet` and to `load_pretrained`.
        in_chans: forwarded to `ResNet` and to `load_pretrained`.
        **kwargs: extra keyword arguments passed through to `ResNet` unchanged.

    Returns:
        The model produced by `ResNet`, with `default_cfg` attached.
    """
    cfg = default_cfgs['skresnext50_32x4d']
    net = ResNet(
        SelectiveKernelBottleneck,
        [3, 4, 6, 3],
        cardinality=32,
        base_width=4,
        num_classes=num_classes,
        in_chans=in_chans,
        zero_init_last_bn=False,
        **kwargs)
    net.default_cfg = cfg
    if pretrained:
        load_pretrained(net, cfg, num_classes, in_chans)
    return net
--- a/timm/models/xception.py
+++ b/timm/models/xception.py
@ -29,7 +29,7 @@ import torch.nn.functional as F
 from .registry import register_model
 from .helpers import load_pretrained
-from .adaptive_avgmax_pool import SelectAdaptivePool2d
+from .layers import SelectAdaptivePool2d
 __all__ = ['Xception']
--- a/train.py
+++ b/train.py
@ -81,10 +81,14 @@ parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N',
                    help='input batch size for training (default: 32)')
 parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N',
                    help='ratio of validation batch size to training batch size (default: 1)')
-parser.add_argument('--drop', type=float, default=0.0, metavar='DROP',
+parser.add_argument('--drop', type=float, default=0.0, metavar='PCT',
                    help='Dropout rate (default: 0.)')
-parser.add_argument('--drop-connect', type=float, default=0.0, metavar='DROP',
+parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT',
-                    help='Drop connect rate (default: 0.)')
+                    help='Drop connect rate, DEPRECATED, use drop-path (default: None)')
 parser.add_argument('--drop-path', type=float, default=None, metavar='PCT',
                    help='Drop path rate (default: None)')
 parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
                    help='Drop block rate (default: None)')
 parser.add_argument('--jsd', action='store_true', default=False,
                    help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.')
 # Optimizer parameters
@ -242,7 +246,9 @@ def main():
        pretrained=args.pretrained,
        num_classes=args.num_classes,
        drop_rate=args.drop,
-        drop_connect_rate=args.drop_connect,
+        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
--- a/validate.py
+++ b/validate.py
@ -211,11 +211,24 @@ def main():
        logging.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names)))
        results = []
        try:
            start_batch_size = args.batch_size
            for m, c in model_cfgs:
                batch_size = start_batch_size
                args.model = m
                args.checkpoint = c
                result = OrderedDict(model=args.model)
-                r = validate(args)
+                r = {}
                while not r and batch_size >= args.num_gpu:
                    try:
                        args.batch_size = batch_size
                        print('Validating with batch size: %d' % args.batch_size)
                        r = validate(args)
                    except RuntimeError as e:
                        if batch_size <= args.num_gpu:
                            print("Validation failed with no ability to reduce batch size. Exiting.")
                            raise e
                        batch_size = max(batch_size // 2, args.num_gpu)
                        print("Validation failed, reducing batch size by 50%")
                result.update(r)
                if args.checkpoint:
                    result['checkpoint'] = args.checkpoint