diff --git a/archived_changes/index.html b/archived_changes/index.html
index 9bd6daea..36e366df 100644
--- a/archived_changes/index.html
+++ b/archived_changes/index.html
@@ -297,6 +297,20 @@
(efficientnet_em) model trained in PyTorch, 79.3 top-1
(--channels-last, --native-amp vs --apex-amp)
in_chans != 3 on several models
Universal feature extraction, new models, new weights, new test sets.
results/README.md
Bunch of changes:
@@ -504,21 +632,6 @@
Conda Environment
All development and testing has been done in Conda Python 3 environments on Linux x86-64 systems, specifically Python 3.6.x, 3.7.x, and 3.8.x.
Little to no care has been taken to be Python 2.x friendly, and it will not be supported. If you run into any challenges running on Windows or another OS, I'm definitely open to looking into those issues so long as they occur in a reproducible (read: Conda) environment.
-PyTorch versions 1.4, 1.5.x, and 1.6 have been tested with this code.
+PyTorch versions 1.4, 1.5.x, 1.6, and 1.7 have been tested with this code.
I've tried to keep the dependencies minimal; the setup follows the default PyTorch install instructions for Conda:
conda create -n torch-env
conda activate torch-env
-conda install -c pytorch pytorch torchvision cudatoolkit=10.2
+conda install -c pytorch pytorch torchvision cudatoolkit=11
conda install pyyaml
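As a quick, optional sanity check of the environment above (a minimal sketch, not part of the official install steps), the versions can be printed from Python:
# Hypothetical environment check; adjust to taste.
import torch
import timm
print(torch.__version__)          # should be one of the tested 1.4 - 1.7 releases
print(torch.cuda.is_available())  # True if the CUDA toolkit and driver are usable
print(timm.__version__)           # installed timm version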
Pretrained models can be loaded using timm.create_model
import timm
-m = timm.create_model('mobilenetv3_100', pretrained=True)
+m = timm.create_model('mobilenetv3_large_100', pretrained=True)
m.eval()
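A slightly fuller sketch of the same call (the dummy input below is only illustrative; real images should be resized and normalized with the model's mean/std):
import torch
import timm
# Load pretrained weights and switch to inference mode.
m = timm.create_model('mobilenetv3_large_100', pretrained=True)
m.eval()
# One random 3x224x224 "image"; the output has 1000 ImageNet-1K logits.
with torch.no_grad():
    logits = m(torch.randn(1, 3, 224, 224))
print(logits.shape)  # torch.Size([1, 1000])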
TResNet: High Performance GPU-Dedicated Architecture - https://arxiv.org/abs/2003.13630
An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929
CenterMask: Real-Time Anchor-Free Instance Segmentation - https://arxiv.org/abs/1911.06667
-CSV files containing an ImageNet-1K validation and OOD test set validation results for all included models with pretrained weights and default configurations is located here.
+CSV files containing ImageNet-1K validation and out-of-distribution (OOD) test set validation results for all included models with pretrained weights and default configurations are located here.
I've leveraged the training scripts in this repository to train a few of the models to good levels of performance.
| Model | Acc@1 (Err) | Acc@5 (Err) | Param # (M) | Interpolation | Image Size |
|---|---|---|---|---|---|
| efficientnet_b3a | 82.242 (17.758) | 96.114 (3.886) | 12.23 | bicubic | 320 (1.0 crop) |
| efficientnet_b3 | 82.076 (17.924) | 96.020 (3.980) | 12.23 | bicubic | 300 |
| regnet_32 | 82.002 (17.998) | 95.906 (4.094) | 19.44 | bicubic | 224 |
| skresnext50d_32x4d | 81.278 (18.722) | 95.366 (4.634) | | | 288 (1.0 crop) |
| seresnext50d_32x4d | 81.266 (18.734) | 95.620 (4.380) | 27.6 | bicubic | 224 |
| efficientnet_b2a | 80.608 (19.392) | 95.310 (4.690) | | | 288 (1.0 crop) |
| resnet50d | 80.530 (19.470) | 95.160 (4.840) | 25.6 | bicubic | 224 |
| mixnet_xl | 80.478 (19.522) | 94.932 (5.068) | | | 260 |
| seresnet50 | 80.274 (19.726) | 95.070 (4.930) | 28.1 | bicubic | 224 |
| skresnext50d_32x4d | 80.156 (19.844) | 94.642 (5.358) | | | 224 |
| cspdarknet53 | 80.058 (19.942) | 95.084 (4.916) | 27.6 | bicubic | 256 |
| cspresnext50 | 80.040 (19.960) | 94.944 (5.056) | 20.6 | bicubic | 224 |
| resnext50_32x4d | 79.762 (20.238) | 94.600 (5.400) | | | 224 |
| cspresnet50 | 79.574 (20.426) | 94.712 (5.288) | 21.6 | bicubic | 256 |
| ese_vovnet39b | 79.320 (20.680) | 94.710 (5.290) | | | 224 |
| dpn68b | 79.216 (20.784) | 94.414 (5.586) | 12.6 | bicubic | 224 |
| resnet50 | 79.038 (20.962) | 94.390 (5.610) | | | 224 |
| resnet34d | 77.116 (22.884) | 93.382 (6.618) | 21.8 | bicubic | 224 |
| seresnext26_32x4d | 77.104 (22.896) | 93.316 (6.684) | | | 224 |
| resnet18d | 72.260 (27.740) | 90.696 (9.304) | 11.7 | bicubic | 224 |
| seresnet18 | 71.742 (28.258) | 90.334 (9.666) | 11.8 | bicubic | 224 |
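The interpolation, crop, and image size columns above are each model's default preprocessing. Rather than hardcoding them, they can be pulled from a model's pretrained config; a small sketch (efficientnet_b3 is used purely as an example):
import timm
from timm.data import resolve_data_config, create_transform
# Resolve the model's default data config (input size, interpolation,
# crop pct, mean/std) and build the matching eval transform from it.
model = timm.create_model('efficientnet_b3', pretrained=True)
config = resolve_data_config({}, model=model)
print(config)
transform = create_transform(**config)
print(transform)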
For the models below, the model code and weight porting from Tensorflow or MXNet Gluon to PyTorch was done by me. Weights/models ported by others are also included in this repository, but they are not listed below.
| Model | Acc@1 (Err) | Acc@5 (Err) | Param # (M) | Interpolation | Image Size |
|---|---|---|---|---|---|
| tf_efficientnet_l2_ns *tfp | 88.352 (11.648) | 98.652 (1.348) | 480 | bicubic | 800 |
| tf_efficientnet_l2_ns | TBD | TBD | 480 | bicubic | 800 |
| tf_efficientnet_l2_ns_475 | 88.234 (11.766) | 98.546 (1.454) | 480 | bicubic | 475 |
| tf_efficientnet_l2_ns_475 *tfp | 88.172 (11.828) | 98.566 (1.434) | 480 | bicubic | 475 |
| tf_efficientnet_b7_ns *tfp | 86.844 (13.156) | 98.084 (1.916) | 66.35 | bicubic | 600 |
| tf_efficientnet_b7_ns | 86.840 (13.160) | 98.094 (1.906) | 66.35 | bicubic | 600 |
| tf_efficientnet_b6_ns | 86.452 (13.548) | 97.882 (2.118) | 43.04 | bicubic | 528 |
| tf_efficientnet_b6_ns *tfp | 86.444 (13.556) | 97.880 (2.120) | 43.04 | bicubic | 528 |
| tf_efficientnet_b5_ns *tfp | 86.064 (13.936) | 97.746 (2.254) | 30.39 | bicubic | 456 |
| tf_efficientnet_b5_ns | 86.088 (13.912) | 97.752 (2.248) | 30.39 | bicubic | 456 |
| tf_efficientnet_b8_ap *tfp | 85.436 (14.564) | 97.272 (2.728) | 87.4 | bicubic | 672 |
| tf_efficientnet_b8 *tfp | 85.384 (14.616) | 97.394 (2.606) | 87.4 | bicubic | 672 |
| tf_efficientnet_b8 | 85.370 (14.630) | 97.390 (2.610) | 87.4 | bicubic | 672 |
| tf_efficientnet_b8_ap | 85.368 (14.632) | 97.294 (2.706) | 87.4 | bicubic | 672 |
| tf_efficientnet_b4_ns *tfp | 85.298 (14.702) | 97.504 (2.496) | 19.34 | bicubic | 380 |
| tf_efficientnet_b4_ns | 85.162 (14.838) | 97.470 (2.530) | 19.34 | bicubic | 380 |
| tf_efficientnet_b7_ap *tfp | 85.154 (14.846) | 97.244 (2.756) | 66.35 | bicubic | 600 |
| tf_efficientnet_b7_ap | 85.118 (14.882) | 97.252 (2.748) | 66.35 | bicubic | 600 |
| tf_efficientnet_b7 *tfp | 84.940 (15.060) | 97.214 (2.786) | 66.35 | bicubic | 600 |
| tf_efficientnet_b7 | 84.932 (15.068) | 97.208 (2.792) | 66.35 | bicubic | 600 |
| tf_efficientnet_b6_ap | 84.786 (15.214) | 97.138 (2.862) | 43.04 | bicubic | 528 |
| tf_efficientnet_b6_ap *tfp | 84.760 (15.240) | 97.124 (2.876) | 43.04 | bicubic | 528 |
| tf_efficientnet_b5_ap *tfp | 84.276 (15.724) | 96.932 (3.068) | 30.39 | bicubic | 456 |
| tf_efficientnet_b5_ap | 84.254 (15.746) | 96.976 (3.024) | 30.39 | bicubic | 456 |
| tf_efficientnet_b6 *tfp | 84.140 (15.860) | 96.852 (3.148) | 43.04 | bicubic | 528 |
| tf_efficientnet_b6 | 84.110 (15.890) | 96.886 (3.114) | 43.04 | bicubic | 528 |
| tf_efficientnet_b3_ns *tfp | 84.054 (15.946) | 96.918 (3.082) | 12.23 | bicubic | 300 |
| tf_efficientnet_b3_ns | 84.048 (15.952) | 96.910 (3.090) | 12.23 | bicubic | 300 |
| tf_efficientnet_b5 *tfp | 83.822 (16.178) | 96.756 (3.244) | 30.39 | bicubic | 456 |
| tf_efficientnet_b5 | 83.812 (16.188) | 96.748 (3.252) | 30.39 | bicubic | 456 |
| tf_efficientnet_b4_ap *tfp | 83.278 (16.722) | 96.376 (3.624) | 19.34 | bicubic | 380 |
| tf_efficientnet_b4_ap | 83.248 (16.752) | 96.388 (3.612) | 19.34 | bicubic | 380 |
| tf_efficientnet_b4 | 83.022 (16.978) | 96.300 (3.700) | 19.34 | bicubic | 380 |
| tf_efficientnet_b4 *tfp | 82.948 (17.052) | 96.308 (3.692) | 19.34 | bicubic | 380 |
| tf_efficientnet_b2_ns *tfp | 82.436 (17.564) | 96.268 (3.732) | 9.11 | bicubic | 260 |
| tf_efficientnet_b2_ns | 82.380 (17.620) | 96.248 (3.752) | 9.11 | bicubic | 260 |
| tf_efficientnet_b3_ap *tfp | 81.882 (18.118) | 95.662 (4.338) | 12.23 | bicubic | 300 |
| tf_efficientnet_b3_ap | 81.828 (18.172) | 95.624 (4.376) | 12.23 | bicubic | 300 |
| tf_efficientnet_b3 | 81.636 (18.364) | 95.718 (4.282) | 12.23 | bicubic | 300 |
| tf_efficientnet_b3 *tfp | 81.576 (18.424) | 95.662 (4.338) | 12.23 | bicubic | 300 |
| tf_efficientnet_lite4 | 81.528 (18.472) | 95.668 (4.332) | 13.00 | bilinear | 380 |
| tf_efficientnet_b1_ns *tfp | 81.514 (18.486) | 95.776 (4.224) | 7.79 | bicubic | 240 |
| tf_efficientnet_lite4 *tfp | 81.502 (18.498) | 95.676 (4.324) | 13.00 | bilinear | 380 |
| tf_efficientnet_b1_ns | 81.388 (18.612) | 95.738 (4.262) | 7.79 | bicubic | 240 |
| gluon_senet154 | 81.224 (18.776) | 95.356 (4.644) | 115.09 | bicubic | 224 |
| gluon_resnet152_v1s | 81.012 (18.988) | 95.416 (4.584) | 60.32 | bicubic | 224 |
| gluon_seresnext101_32x4d | 80.902 (19.098) | 95.294 (4.706) | 48.96 | bicubic | 224 |
| gluon_seresnext101_64x4d | 80.890 (19.110) | 95.304 (4.696) | 88.23 | bicubic | 224 |
| gluon_resnext101_64x4d | 80.602 (19.398) | 94.994 (5.006) | 83.46 | bicubic | 224 |
| tf_efficientnet_el | 80.534 (19.466) | 95.190 (4.810) | 10.59 | bicubic | 300 |
| tf_efficientnet_el *tfp | 80.476 (19.524) | 95.200 (4.800) | 10.59 | bicubic | 300 |
| gluon_resnet152_v1d | 80.470 (19.530) | 95.206 (4.794) | 60.21 | bicubic | 224 |
| gluon_resnet101_v1d | 80.424 (19.576) | 95.020 (4.980) | 44.57 | bicubic | 224 |
| tf_efficientnet_b2_ap *tfp | 80.420 (19.580) | 95.040 (4.960) | 9.11 | bicubic | 260 |
| gluon_resnext101_32x4d | 80.334 (19.666) | 94.926 (5.074) | 44.18 | bicubic | 224 |
| tf_efficientnet_b2_ap | 80.306 (19.694) | 95.028 (4.972) | 9.11 | bicubic | 260 |
| gluon_resnet101_v1s | 80.300 (19.700) | 95.150 (4.850) | 44.67 | bicubic | 224 |
| tf_efficientnet_b2 *tfp | 80.188 (19.812) | 94.974 (5.026) | 9.11 | bicubic | 260 |
| tf_efficientnet_b2 | 80.086 (19.914) | 94.908 (5.092) | 9.11 | bicubic | 260 |
| gluon_resnet152_v1c | 79.916 (20.084) | 94.842 (5.158) | 60.21 | bicubic | 224 |
| gluon_seresnext50_32x4d | 79.912 (20.088) | 94.818 (5.182) | 27.56 | bicubic | 224 |
| tf_efficientnet_lite3 | 79.812 (20.188) | 94.914 (5.086) | 8.20 | bilinear | 300 |
| tf_efficientnet_lite3 *tfp | 79.734 (20.266) | 94.838 (5.162) | 8.20 | bilinear | 300 |
| gluon_resnet152_v1b | 79.692 (20.308) | 94.738 (5.262) | 60.19 | bicubic | 224 |
| gluon_xception65 | 79.604 (20.396) | 94.748 (5.252) | 39.92 | bicubic | 299 |
| gluon_resnet101_v1c | 79.544 (20.456) | 94.586 (5.414) | 44.57 | bicubic | 224 |
| tf_efficientnet_b1_ap *tfp | 79.532 (20.468) | 94.378 (5.622) | 7.79 | bicubic | 240 |
| tf_efficientnet_cc_b1_8e *tfp | 79.464 (20.536) | 94.492 (5.508) | 39.7 | bicubic | 240 |
| gluon_resnext50_32x4d | 79.356 (20.644) | 94.424 (5.576) | 25.03 | bicubic | 224 |
| gluon_resnet101_v1b | 79.304 (20.696) | 94.524 (5.476) | 44.55 | bicubic | 224 |
| tf_efficientnet_cc_b1_8e | 79.298 (20.702) | 94.364 (5.636) | 39.7 | bicubic | 240 |
| tf_efficientnet_b1_ap | 79.278 (20.722) | 94.308 (5.692) | 7.79 | bicubic | 240 |
| tf_efficientnet_b1 *tfp | 79.172 (20.828) | 94.450 (5.550) | 7.79 | bicubic | 240 |
| gluon_resnet50_v1d | 79.074 (20.926) | 94.476 (5.524) | 25.58 | bicubic | 224 |
| tf_efficientnet_em *tfp | 78.958 (21.042) | 94.458 (5.542) | 6.90 | bicubic | 240 |
| tf_mixnet_l *tfp | 78.846 (21.154) | 94.212 (5.788) | 7.33 | bilinear | 224 |
| tf_efficientnet_b1 | 78.826 (21.174) | 94.198 (5.802) | 7.79 | bicubic | 240 |
| tf_efficientnet_b0_ns *tfp | 78.806 (21.194) | 94.496 (5.504) | 5.29 | bicubic | 224 |
| gluon_inception_v3 | 78.804 (21.196) | 94.380 (5.620) | 27.16M | bicubic | 299 |
| tf_mixnet_l | 78.770 (21.230) | 94.004 (5.996) | 7.33 | bicubic | 224 |
| tf_efficientnet_em | 78.742 (21.258) | 94.332 (5.668) | 6.90 | bicubic | 240 |
| gluon_resnet50_v1s | 78.712 (21.288) | 94.242 (5.758) | 25.68 | bicubic | 224 |
| tf_efficientnet_b0_ns | 78.658 (21.342) | 94.376 (5.624) | 5.29 | bicubic | 224 |
| tf_efficientnet_cc_b0_8e *tfp | 78.314 (21.686) | 93.790 (6.210) | 24.0 | bicubic | 224 |
| gluon_resnet50_v1c | 78.010 (21.990) | 93.988 (6.012) | 25.58 | bicubic | 224 |
| tf_efficientnet_cc_b0_8e | 77.908 (22.092) | 93.656 (6.344) | 24.0 | bicubic | 224 |
| tf_inception_v3 | 77.856 (22.144) | 93.644 (6.356) | 27.16M | bicubic | 299 |
| tf_efficientnet_cc_b0_4e *tfp | 77.746 (22.254) | 93.552 (6.448) | 13.3 | bicubic | 224 |
| tf_efficientnet_es *tfp | 77.616 (22.384) | 93.750 (6.250) | 5.44 | bicubic | 224 |
| gluon_resnet50_v1b | 77.578 (22.422) | 93.718 (6.282) | 25.56 | bicubic | 224 |
| adv_inception_v3 | 77.576 (22.424) | 93.724 (6.276) | 27.16M | bicubic | 299 |
| tf_efficientnet_lite2 *tfp | 77.544 (22.456) | 93.800 (6.200) | 6.09 | bilinear | 260 |
| tf_efficientnet_lite2 | 77.460 (22.540) | 93.746 (6.254) | 6.09 | bicubic | 260 |
| tf_efficientnet_b0_ap *tfp | 77.514 (22.486) | 93.576 (6.424) | 5.29 | bicubic | 224 |
| tf_efficientnet_cc_b0_4e | 77.304 (22.696) | 93.332 (6.668) | 13.3 | bicubic | 224 |
| tf_efficientnet_es | 77.264 (22.736) | 93.600 (6.400) | 5.44 | bicubic | 224 |
| tf_efficientnet_b0 *tfp | 77.258 (22.742) | 93.478 (6.522) | 5.29 | bicubic | 224 |
| tf_efficientnet_b0_ap | 77.084 (22.916) | 93.254 (6.746) | 5.29 | bicubic | 224 |
| tf_mixnet_m *tfp | 77.072 (22.928) | 93.368 (6.632) | 5.01 | bilinear | 224 |
| tf_mixnet_m | 76.950 (23.050) | 93.156 (6.844) | 5.01 | bicubic | 224 |
| tf_efficientnet_b0 | 76.848 (23.152) | 93.228 (6.772) | 5.29 | bicubic | 224 |
| tf_efficientnet_lite1 *tfp | 76.764 (23.236) | 93.326 (6.674) | 5.42 | bilinear | 240 |
| tf_efficientnet_lite1 | 76.638 (23.362) | 93.232 (6.768) | 5.42 | bicubic | 240 |
| tf_mixnet_s *tfp | 75.800 (24.200) | 92.788 (7.212) | 4.13 | bilinear | 224 |
| tf_mobilenetv3_large_100 *tfp | 75.768 (24.232) | 92.710 (7.290) | 5.48 | bilinear | 224 |
| tf_mixnet_s | 75.648 (24.352) | 92.636 (7.364) | 4.13 | bicubic | 224 |
| tf_mobilenetv3_large_100 | 75.516 (24.484) | 92.600 (7.400) | 5.48 | bilinear | 224 |
| tf_efficientnet_lite0 *tfp | 75.074 (24.926) | 92.314 (7.686) | 4.65 | bilinear | 224 |
| tf_efficientnet_lite0 | 74.842 (25.158) | 92.170 (7.830) | 4.65 | bicubic | 224 |
| gluon_resnet34_v1b | 74.580 (25.420) | 91.988 (8.012) | 21.80 | bicubic | 224 |
| tf_mobilenetv3_large_075 *tfp | 73.730 (26.270) | 91.616 (8.384) | 3.99 | bilinear | 224 |
| tf_mobilenetv3_large_075 | 73.442 (26.558) | 91.352 (8.648) | 3.99 | bilinear | 224 |
| tf_mobilenetv3_large_minimal_100 *tfp | 72.678 (27.322) | 90.860 (9.140) | 3.92 | bilinear | 224 |
| tf_mobilenetv3_large_minimal_100 | 72.244 (27.756) | 90.636 (9.364) | 3.92 | bilinear | 224 |
| tf_mobilenetv3_small_100 *tfp | 67.918 (32.082) | 87.958 (12.042) | 2.54 | bilinear | 224 |
| tf_mobilenetv3_small_100 | 67.918 (32.082) | 87.662 (12.338) | 2.54 | bilinear | 224 |
| tf_mobilenetv3_small_075 *tfp | 66.142 (33.858) | 86.498 (13.502) | 2.04 | bilinear | 224 |
| tf_mobilenetv3_small_075 | 65.718 (34.282) | 86.136 (13.864) | 2.04 | bilinear | 224 |
| tf_mobilenetv3_small_minimal_100 *tfp | 63.378 (36.622) | 84.802 (15.198) | 2.04 | bilinear | 224 |
| tf_mobilenetv3_small_minimal_100 | 62.898 (37.102) | 84.230 (15.770) | 2.04 | bilinear | 224 |
Models with *tfp next to them were scored with the --tf-preprocessing flag.
The tf_efficientnet and tf_mixnet models require an equivalent of Tensorflow 'SAME' padding, as their architectures result in asymmetric padding. I've added this in the model creation wrapper, but it does come with a performance penalty.
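As an illustration (a minimal sketch, not a benchmark of that penalty), the SAME-padding handling is transparent to the caller; a tf_ model is created and used like any other timm model:
import torch
import timm
# The SAME-padding equivalent is applied inside the model's conv layers,
# so nothing extra is needed at creation or inference time.
m = timm.create_model('tf_efficientnet_b0', pretrained=True)
m.eval()
with torch.no_grad():
    out = m(torch.randn(1, 3, 224, 224))
print(out.shape)  # torch.Size([1, 1000])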
Sources for original weights:
* tf_efficientnet*: Tensorflow TPU
* tf_efficientnet_e*: Tensorflow TPU
* tf_mixnet*: Tensorflow TPU
* tf_inception*: Tensorflow Slim
* gluon_*: MxNet Gluon
For weights ported from other deep learning frameworks (Tensorflow, MXNet GluonCV) or copied from other PyTorch sources, please see the full results tables for ImageNet and the various OOD test sets in the results tables.
+Model code .py files contain links to original sources of models and weights.
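To find those ported weights programmatically, the wildcard form of timm.list_models can be used; a small sketch (the patterns below are only examples):
import timm
from pprint import pprint
# Ported model names follow consistent prefixes, so wildcards pick them out.
pprint(timm.list_models('tf_efficientnet*', pretrained=True)[:5])
pprint(timm.list_models('gluon_*', pretrained=True)[:5])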
diff --git a/search/search_index.json b/search/search_index.json
index e5eef407..41a568a7 100644
--- a/search/search_index.json
+++ b/search/search_index.json
Split-Attention Networks - https://arxiv.org/abs/2004.08955 Code: https://github.com/zhanghang1989/ResNeSt","title":"ResNeSt [resnest.py]"},{"location":"models/#rexnet-rexnetpy","text":"Paper: ReXNet: Diminishing Representational Bottleneck on CNN - https://arxiv.org/abs/2007.00992 Code: https://github.com/clovaai/rexnet","title":"ReXNet [rexnet.py]"},{"location":"models/#selective-kernel-networks-sknetpy","text":"Paper: Selective-Kernel Networks - https://arxiv.org/abs/1903.06586 Code: https://github.com/implus/SKNet , https://github.com/clovaai/assembled-cnn","title":"Selective-Kernel Networks [sknet.py]"},{"location":"models/#selecsls-selecslspy","text":"Paper: XNect: Real-time Multi-Person 3D Motion Capture with a Single RGB Camera - https://arxiv.org/abs/1907.00837 Code: https://github.com/mehtadushy/SelecSLS-Pytorch","title":"SelecSLS [selecsls.py]"},{"location":"models/#squeeze-and-excitation-networks-senetpy","text":"NOTE: I am deprecating this version of the networks, the new ones are part of resnet.py Paper: Squeeze-and-Excitation Networks - https://arxiv.org/abs/1709.01507 Code: https://github.com/Cadene/pretrained-models.pytorch","title":"Squeeze-and-Excitation Networks [senet.py]"},{"location":"models/#tresnet-tresnetpy","text":"Paper: TResNet: High Performance GPU-Dedicated Architecture - https://arxiv.org/abs/2003.13630 Code: https://github.com/mrT23/TResNet","title":"TResNet [tresnet.py]"},{"location":"models/#vovnet-v2-and-v1-vovnetpy","text":"Paper: CenterMask : Real-Time Anchor-Free Instance Segmentation - https://arxiv.org/abs/1911.06667 Reference code: https://github.com/youngwanLEE/vovnet-detectron2","title":"VovNet V2 and V1 [vovnet.py]"},{"location":"models/#xception-xceptionpy","text":"Paper: Xception: Deep Learning with Depthwise Separable Convolutions - https://arxiv.org/abs/1610.02357 Code: https://github.com/Cadene/pretrained-models.pytorch","title":"Xception [xception.py]"},{"location":"models/#xception-modified-aligned-gluon-gluon_xceptionpy","text":"Paper: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/dmlc/gluon-cv/tree/master/gluoncv/model_zoo , https://github.com/jfzhang95/pytorch-deeplab-xception/","title":"Xception (Modified Aligned, Gluon) [gluon_xception.py]"},{"location":"models/#xception-modified-aligned-tf-aligned_xceptionpy","text":"Paper: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/tensorflow/models/tree/master/research/deeplab","title":"Xception (Modified Aligned, TF) [aligned_xception.py]"},{"location":"results/","text":"Results CSV files containing ImageNet-1K validation and out-of-distribution (OOD) test set validation results for all included models with pretrained weights and default configurations are located here . Self-trained Weights I've leveraged the training scripts in this repository to train a few of the models to good levels of performance.
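The tables that follow report top-1 / top-5 accuracy with the corresponding error in parentheses (error = 100 - accuracy). As a rough, minimal sketch of how such top-k metrics can be computed in PyTorch (illustrative only, not the exact code used by the validation script):

import torch

def topk_accuracy(logits, target, ks=(1, 5)):
    # logits: (batch, num_classes) model outputs, target: (batch,) class indices
    _, pred = logits.topk(max(ks), dim=1)       # indices of the highest-scoring classes
    correct = pred.eq(target.unsqueeze(1))      # (batch, max_k) matrix of hits
    return [correct[:, :k].any(dim=1).float().mean().item() * 100 for k in ks]

# example with random predictions over 1000 ImageNet classes
top1, top5 = topk_accuracy(torch.randn(8, 1000), torch.randint(0, 1000, (8,)))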
Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size efficientnet_b3a 81.874 (18.126) 95.840 (4.160) 12.23 bicubic 320 (1.0 crop) efficientnet_b3 81.498 (18.502) 95.718 (4.282) 12.23 bicubic 300 skresnext50d_32x4d 81.278 (18.722) 95.366 (4.634) 27.5 bicubic 288 (1.0 crop) efficientnet_b2a 80.608 (19.392) 95.310 (4.690) 9.11 bicubic 288 (1.0 crop) mixnet_xl 80.478 (19.522) 94.932 (5.068) 11.90 bicubic 224 efficientnet_b2 80.402 (19.598) 95.076 (4.924) 9.11 bicubic 260 skresnext50d_32x4d 80.156 (19.844) 94.642 (5.358) 27.5 bicubic 224 resnext50_32x4d 79.762 (20.238) 94.600 (5.400) 25 bicubic 224 resnext50d_32x4d 79.674 (20.326) 94.868 (5.132) 25.1 bicubic 224 ese_vovnet39b 79.320 (20.680) 94.710 (5.290) 24.6 bicubic 224 resnetblur50 79.290 (20.710) 94.632 (5.368) 25.6 bicubic 224 resnet50 79.038 (20.962) 94.390 (5.610) 25.6 bicubic 224 mixnet_l 78.976 (21.024 94.184 (5.816) 7.33 bicubic 224 efficientnet_b1 78.692 (21.308) 94.086 (5.914) 7.79 bicubic 240 efficientnet_es 78.066 (21.934) 93.926 (6.074) 5.44 bicubic 224 seresnext26t_32x4d 77.998 (22.002) 93.708 (6.292) 16.8 bicubic 224 seresnext26tn_32x4d 77.986 (22.014) 93.746 (6.254) 16.8 bicubic 224 efficientnet_b0 77.698 (22.302) 93.532 (6.468) 5.29 bicubic 224 seresnext26d_32x4d 77.602 (22.398) 93.608 (6.392) 16.8 bicubic 224 mobilenetv2_120d 77.294 (22.706 93.502 (6.498) 5.8 bicubic 224 mixnet_m 77.256 (22.744) 93.418 (6.582) 5.01 bicubic 224 seresnext26_32x4d 77.104 (22.896) 93.316 (6.684) 16.8 bicubic 224 skresnet34 76.912 (23.088) 93.322 (6.678) 22.2 bicubic 224 ese_vovnet19b_dw 76.798 (23.202) 93.268 (6.732) 6.5 bicubic 224 resnet26d 76.68 (23.32) 93.166 (6.834) 16 bicubic 224 densenetblur121d 76.576 (23.424) 93.190 (6.810) 8.0 bicubic 224 mobilenetv2_140 76.524 (23.476) 92.990 (7.010) 6.1 bicubic 224 mixnet_s 75.988 (24.012) 92.794 (7.206) 4.13 bicubic 224 mobilenetv3_large_100 75.766 (24.234) 92.542 (7.458) 5.5 bicubic 224 mobilenetv3_rw 75.634 (24.366) 92.708 (7.292) 5.5 bicubic 224 mnasnet_a1 75.448 (24.552) 92.604 (7.396) 3.89 bicubic 224 resnet26 75.292 (24.708) 92.57 (7.43) 16 bicubic 224 fbnetc_100 75.124 (24.876) 92.386 (7.614) 5.6 bilinear 224 resnet34 75.110 (24.890) 92.284 (7.716) 22 bilinear 224 mobilenetv2_110d 75.052 (24.948) 92.180 (7.820) 4.5 bicubic 224 seresnet34 74.808 (25.192) 92.124 (7.876) 22 bilinear 224 mnasnet_b1 74.658 (25.342) 92.114 (7.886) 4.38 bicubic 224 spnasnet_100 74.084 (25.916) 91.818 (8.182) 4.42 bilinear 224 skresnet18 73.038 (26.962) 91.168 (8.832) 11.9 bicubic 224 mobilenetv2_100 72.978 (27.022) 91.016 (8.984) 3.5 bicubic 224 seresnet18 71.742 (28.258) 90.334 (9.666) 11.8 bicubic 224 Ported Weights For the models below, the model code and weight porting from Tensorflow or MXNet Gluon to Pytorch was done by myself. There are weights/models ported by others included in this repository, they are not listed below. 
Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size tf_efficientnet_l2_ns *tfp 88.352 (11.648) 98.652 (1.348) 480 bicubic 800 tf_efficientnet_l2_ns TBD TBD 480 bicubic 800 tf_efficientnet_l2_ns_475 88.234 (11.766) 98.546 (1.454)f 480 bicubic 475 tf_efficientnet_l2_ns_475 *tfp 88.172 (11.828) 98.566 (1.434) 480 bicubic 475 tf_efficientnet_b7_ns *tfp 86.844 (13.156) 98.084 (1.916) 66.35 bicubic 600 tf_efficientnet_b7_ns 86.840 (13.160) 98.094 (1.906) 66.35 bicubic 600 tf_efficientnet_b6_ns 86.452 (13.548) 97.882 (2.118) 43.04 bicubic 528 tf_efficientnet_b6_ns *tfp 86.444 (13.556) 97.880 (2.120) 43.04 bicubic 528 tf_efficientnet_b5_ns *tfp 86.064 (13.936) 97.746 (2.254) 30.39 bicubic 456 tf_efficientnet_b5_ns 86.088 (13.912) 97.752 (2.248) 30.39 bicubic 456 tf_efficientnet_b8_ap *tfp 85.436 (14.564) 97.272 (2.728) 87.4 bicubic 672 tf_efficientnet_b8 *tfp 85.384 (14.616) 97.394 (2.606) 87.4 bicubic 672 tf_efficientnet_b8 85.370 (14.630) 97.390 (2.610) 87.4 bicubic 672 tf_efficientnet_b8_ap 85.368 (14.632) 97.294 (2.706) 87.4 bicubic 672 tf_efficientnet_b4_ns *tfp 85.298 (14.702) 97.504 (2.496) 19.34 bicubic 380 tf_efficientnet_b4_ns 85.162 (14.838) 97.470 (2.530) 19.34 bicubic 380 tf_efficientnet_b7_ap *tfp 85.154 (14.846) 97.244 (2.756) 66.35 bicubic 600 tf_efficientnet_b7_ap 85.118 (14.882) 97.252 (2.748) 66.35 bicubic 600 tf_efficientnet_b7 *tfp 84.940 (15.060) 97.214 (2.786) 66.35 bicubic 600 tf_efficientnet_b7 84.932 (15.068) 97.208 (2.792) 66.35 bicubic 600 tf_efficientnet_b6_ap 84.786 (15.214) 97.138 (2.862) 43.04 bicubic 528 tf_efficientnet_b6_ap *tfp 84.760 (15.240) 97.124 (2.876) 43.04 bicubic 528 tf_efficientnet_b5_ap *tfp 84.276 (15.724) 96.932 (3.068) 30.39 bicubic 456 tf_efficientnet_b5_ap 84.254 (15.746) 96.976 (3.024) 30.39 bicubic 456 tf_efficientnet_b6 *tfp 84.140 (15.860) 96.852 (3.148) 43.04 bicubic 528 tf_efficientnet_b6 84.110 (15.890) 96.886 (3.114) 43.04 bicubic 528 tf_efficientnet_b3_ns *tfp 84.054 (15.946) 96.918 (3.082) 12.23 bicubic 300 tf_efficientnet_b3_ns 84.048 (15.952) 96.910 (3.090) 12.23 bicubic 300 tf_efficientnet_b5 *tfp 83.822 (16.178) 96.756 (3.244) 30.39 bicubic 456 tf_efficientnet_b5 83.812 (16.188) 96.748 (3.252) 30.39 bicubic 456 tf_efficientnet_b4_ap *tfp 83.278 (16.722) 96.376 (3.624) 19.34 bicubic 380 tf_efficientnet_b4_ap 83.248 (16.752) 96.388 (3.612) 19.34 bicubic 380 tf_efficientnet_b4 83.022 (16.978) 96.300 (3.700) 19.34 bicubic 380 tf_efficientnet_b4 *tfp 82.948 (17.052) 96.308 (3.692) 19.34 bicubic 380 tf_efficientnet_b2_ns *tfp 82.436 (17.564) 96.268 (3.732) 9.11 bicubic 260 tf_efficientnet_b2_ns 82.380 (17.620) 96.248 (3.752) 9.11 bicubic 260 tf_efficientnet_b3_ap *tfp 81.882 (18.118) 95.662 (4.338) 12.23 bicubic 300 tf_efficientnet_b3_ap 81.828 (18.172) 95.624 (4.376) 12.23 bicubic 300 tf_efficientnet_b3 81.636 (18.364) 95.718 (4.282) 12.23 bicubic 300 tf_efficientnet_b3 *tfp 81.576 (18.424) 95.662 (4.338) 12.23 bicubic 300 tf_efficientnet_lite4 81.528 (18.472) 95.668 (4.332) 13.00 bilinear 380 tf_efficientnet_b1_ns *tfp 81.514 (18.486) 95.776 (4.224) 7.79 bicubic 240 tf_efficientnet_lite4 *tfp 81.502 (18.498) 95.676 (4.324) 13.00 bilinear 380 tf_efficientnet_b1_ns 81.388 (18.612) 95.738 (4.262) 7.79 bicubic 240 gluon_senet154 81.224 (18.776) 95.356 (4.644) 115.09 bicubic 224 gluon_resnet152_v1s 81.012 (18.988) 95.416 (4.584) 60.32 bicubic 224 gluon_seresnext101_32x4d 80.902 (19.098) 95.294 (4.706) 48.96 bicubic 224 gluon_seresnext101_64x4d 80.890 (19.110) 95.304 (4.696) 88.23 bicubic 224 gluon_resnext101_64x4d 80.602 
(19.398) 94.994 (5.006) 83.46 bicubic 224 tf_efficientnet_el 80.534 (19.466) 95.190 (4.810) 10.59 bicubic 300 tf_efficientnet_el *tfp 80.476 (19.524) 95.200 (4.800) 10.59 bicubic 300 gluon_resnet152_v1d 80.470 (19.530) 95.206 (4.794) 60.21 bicubic 224 gluon_resnet101_v1d 80.424 (19.576) 95.020 (4.980) 44.57 bicubic 224 tf_efficientnet_b2_ap *tfp 80.420 (19.580) 95.040 (4.960) 9.11 bicubic 260 gluon_resnext101_32x4d 80.334 (19.666) 94.926 (5.074) 44.18 bicubic 224 tf_efficientnet_b2_ap 80.306 (19.694) 95.028 (4.972) 9.11 bicubic 260 gluon_resnet101_v1s 80.300 (19.700) 95.150 (4.850) 44.67 bicubic 224 tf_efficientnet_b2 *tfp 80.188 (19.812) 94.974 (5.026) 9.11 bicubic 260 tf_efficientnet_b2 80.086 (19.914) 94.908 (5.092) 9.11 bicubic 260 gluon_resnet152_v1c 79.916 (20.084) 94.842 (5.158) 60.21 bicubic 224 gluon_seresnext50_32x4d 79.912 (20.088) 94.818 (5.182) 27.56 bicubic 224 tf_efficientnet_lite3 79.812 (20.188) 94.914 (5.086) 8.20 bilinear 300 tf_efficientnet_lite3 *tfp 79.734 (20.266) 94.838 (5.162) 8.20 bilinear 300 gluon_resnet152_v1b 79.692 (20.308) 94.738 (5.262) 60.19 bicubic 224 gluon_xception65 79.604 (20.396) 94.748 (5.252) 39.92 bicubic 299 gluon_resnet101_v1c 79.544 (20.456) 94.586 (5.414) 44.57 bicubic 224 tf_efficientnet_b1_ap *tfp 79.532 (20.468) 94.378 (5.622) 7.79 bicubic 240 tf_efficientnet_cc_b1_8e *tfp 79.464 (20.536) 94.492 (5.508) 39.7 bicubic 240 gluon_resnext50_32x4d 79.356 (20.644) 94.424 (5.576) 25.03 bicubic 224 gluon_resnet101_v1b 79.304 (20.696) 94.524 (5.476) 44.55 bicubic 224 tf_efficientnet_cc_b1_8e 79.298 (20.702) 94.364 (5.636) 39.7 bicubic 240 tf_efficientnet_b1_ap 79.278 (20.722) 94.308 (5.692) 7.79 bicubic 240 tf_efficientnet_b1 *tfp 79.172 (20.828) 94.450 (5.550) 7.79 bicubic 240 gluon_resnet50_v1d 79.074 (20.926) 94.476 (5.524) 25.58 bicubic 224 tf_efficientnet_em *tfp 78.958 (21.042) 94.458 (5.542) 6.90 bicubic 240 tf_mixnet_l *tfp 78.846 (21.154) 94.212 (5.788) 7.33 bilinear 224 tf_efficientnet_b1 78.826 (21.174) 94.198 (5.802) 7.79 bicubic 240 tf_efficientnet_b0_ns *tfp 78.806 (21.194) 94.496 (5.504) 5.29 bicubic 224 gluon_inception_v3 78.804 (21.196) 94.380 (5.620) 27.16M bicubic 299 tf_mixnet_l 78.770 (21.230) 94.004 (5.996) 7.33 bicubic 224 tf_efficientnet_em 78.742 (21.258) 94.332 (5.668) 6.90 bicubic 240 gluon_resnet50_v1s 78.712 (21.288) 94.242 (5.758) 25.68 bicubic 224 tf_efficientnet_b0_ns 78.658 (21.342) 94.376 (5.624) 5.29 bicubic 224 tf_efficientnet_cc_b0_8e *tfp 78.314 (21.686) 93.790 (6.210) 24.0 bicubic 224 gluon_resnet50_v1c 78.010 (21.990) 93.988 (6.012) 25.58 bicubic 224 tf_efficientnet_cc_b0_8e 77.908 (22.092) 93.656 (6.344) 24.0 bicubic 224 tf_inception_v3 77.856 (22.144) 93.644 (6.356) 27.16M bicubic 299 tf_efficientnet_cc_b0_4e *tfp 77.746 (22.254) 93.552 (6.448) 13.3 bicubic 224 tf_efficientnet_es *tfp 77.616 (22.384) 93.750 (6.250) 5.44 bicubic 224 gluon_resnet50_v1b 77.578 (22.422) 93.718 (6.282) 25.56 bicubic 224 adv_inception_v3 77.576 (22.424) 93.724 (6.276) 27.16M bicubic 299 tf_efficientnet_lite2 *tfp 77.544 (22.456) 93.800 (6.200) 6.09 bilinear 260 tf_efficientnet_lite2 77.460 (22.540) 93.746 (6.254) 6.09 bicubic 260 tf_efficientnet_b0_ap *tfp 77.514 (22.486) 93.576 (6.424) 5.29 bicubic 224 tf_efficientnet_cc_b0_4e 77.304 (22.696) 93.332 (6.668) 13.3 bicubic 224 tf_efficientnet_es 77.264 (22.736) 93.600 (6.400) 5.44 bicubic 224 tf_efficientnet_b0 *tfp 77.258 (22.742) 93.478 (6.522) 5.29 bicubic 224 tf_efficientnet_b0_ap 77.084 (22.916) 93.254 (6.746) 5.29 bicubic 224 tf_mixnet_m *tfp 77.072 (22.928) 93.368 (6.632) 5.01 
bilinear 224 tf_mixnet_m 76.950 (23.050) 93.156 (6.844) 5.01 bicubic 224 tf_efficientnet_b0 76.848 (23.152) 93.228 (6.772) 5.29 bicubic 224 tf_efficientnet_lite1 *tfp 76.764 (23.236) 93.326 (6.674) 5.42 bilinear 240 tf_efficientnet_lite1 76.638 (23.362) 93.232 (6.768) 5.42 bicubic 240 tf_mixnet_s *tfp 75.800 (24.200) 92.788 (7.212) 4.13 bilinear 224 tf_mobilenetv3_large_100 *tfp 75.768 (24.232) 92.710 (7.290) 5.48 bilinear 224 tf_mixnet_s 75.648 (24.352) 92.636 (7.364) 4.13 bicubic 224 tf_mobilenetv3_large_100 75.516 (24.484) 92.600 (7.400) 5.48 bilinear 224 tf_efficientnet_lite0 *tfp 75.074 (24.926) 92.314 (7.686) 4.65 bilinear 224 tf_efficientnet_lite0 74.842 (25.158) 92.170 (7.830) 4.65 bicubic 224 gluon_resnet34_v1b 74.580 (25.420) 91.988 (8.012) 21.80 bicubic 224 tf_mobilenetv3_large_075 *tfp 73.730 (26.270) 91.616 (8.384) 3.99 bilinear 224 tf_mobilenetv3_large_075 73.442 (26.558) 91.352 (8.648) 3.99 bilinear 224 tf_mobilenetv3_large_minimal_100 *tfp 72.678 (27.322) 90.860 (9.140) 3.92 bilinear 224 tf_mobilenetv3_large_minimal_100 72.244 (27.756) 90.636 (9.364) 3.92 bilinear 224 tf_mobilenetv3_small_100 *tfp 67.918 (32.082) 87.958 (12.042 2.54 bilinear 224 tf_mobilenetv3_small_100 67.918 (32.082) 87.662 (12.338) 2.54 bilinear 224 tf_mobilenetv3_small_075 *tfp 66.142 (33.858) 86.498 (13.502) 2.04 bilinear 224 tf_mobilenetv3_small_075 65.718 (34.282) 86.136 (13.864) 2.04 bilinear 224 tf_mobilenetv3_small_minimal_100 *tfp 63.378 (36.622) 84.802 (15.198) 2.04 bilinear 224 tf_mobilenetv3_small_minimal_100 62.898 (37.102) 84.230 (15.770) 2.04 bilinear 224 Models with *tfp next to them were scored with --tf-preprocessing flag. The tf_efficientnet , tf_mixnet models require an equivalent for 'SAME' padding as their arch results in asymmetric padding. I've added this in the model creation wrapper, but it does come with a performance penalty. Sources for original weights: * tf_efficientnet* : Tensorflow TPU * tf_efficientnet_e* : Tensorflow TPU * tf_mixnet* : Tensorflow TPU * tf_inception* : Tensorflow Slim * gluon_* : MxNet Gluon","title":"Results"},{"location":"results/#results","text":"CSV files containing an ImageNet-1K validation and OOD test set validation results for all included models with pretrained weights and default configurations is located here .","title":"Results"},{"location":"results/#self-trained-weights","text":"I've leveraged the training scripts in this repository to train a few of the models with to good levels of performance. 
Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size efficientnet_b3a 81.874 (18.126) 95.840 (4.160) 12.23 bicubic 320 (1.0 crop) efficientnet_b3 81.498 (18.502) 95.718 (4.282) 12.23 bicubic 300 skresnext50d_32x4d 81.278 (18.722) 95.366 (4.634) 27.5 bicubic 288 (1.0 crop) efficientnet_b2a 80.608 (19.392) 95.310 (4.690) 9.11 bicubic 288 (1.0 crop) mixnet_xl 80.478 (19.522) 94.932 (5.068) 11.90 bicubic 224 efficientnet_b2 80.402 (19.598) 95.076 (4.924) 9.11 bicubic 260 skresnext50d_32x4d 80.156 (19.844) 94.642 (5.358) 27.5 bicubic 224 resnext50_32x4d 79.762 (20.238) 94.600 (5.400) 25 bicubic 224 resnext50d_32x4d 79.674 (20.326) 94.868 (5.132) 25.1 bicubic 224 ese_vovnet39b 79.320 (20.680) 94.710 (5.290) 24.6 bicubic 224 resnetblur50 79.290 (20.710) 94.632 (5.368) 25.6 bicubic 224 resnet50 79.038 (20.962) 94.390 (5.610) 25.6 bicubic 224 mixnet_l 78.976 (21.024 94.184 (5.816) 7.33 bicubic 224 efficientnet_b1 78.692 (21.308) 94.086 (5.914) 7.79 bicubic 240 efficientnet_es 78.066 (21.934) 93.926 (6.074) 5.44 bicubic 224 seresnext26t_32x4d 77.998 (22.002) 93.708 (6.292) 16.8 bicubic 224 seresnext26tn_32x4d 77.986 (22.014) 93.746 (6.254) 16.8 bicubic 224 efficientnet_b0 77.698 (22.302) 93.532 (6.468) 5.29 bicubic 224 seresnext26d_32x4d 77.602 (22.398) 93.608 (6.392) 16.8 bicubic 224 mobilenetv2_120d 77.294 (22.706 93.502 (6.498) 5.8 bicubic 224 mixnet_m 77.256 (22.744) 93.418 (6.582) 5.01 bicubic 224 seresnext26_32x4d 77.104 (22.896) 93.316 (6.684) 16.8 bicubic 224 skresnet34 76.912 (23.088) 93.322 (6.678) 22.2 bicubic 224 ese_vovnet19b_dw 76.798 (23.202) 93.268 (6.732) 6.5 bicubic 224 resnet26d 76.68 (23.32) 93.166 (6.834) 16 bicubic 224 densenetblur121d 76.576 (23.424) 93.190 (6.810) 8.0 bicubic 224 mobilenetv2_140 76.524 (23.476) 92.990 (7.010) 6.1 bicubic 224 mixnet_s 75.988 (24.012) 92.794 (7.206) 4.13 bicubic 224 mobilenetv3_large_100 75.766 (24.234) 92.542 (7.458) 5.5 bicubic 224 mobilenetv3_rw 75.634 (24.366) 92.708 (7.292) 5.5 bicubic 224 mnasnet_a1 75.448 (24.552) 92.604 (7.396) 3.89 bicubic 224 resnet26 75.292 (24.708) 92.57 (7.43) 16 bicubic 224 fbnetc_100 75.124 (24.876) 92.386 (7.614) 5.6 bilinear 224 resnet34 75.110 (24.890) 92.284 (7.716) 22 bilinear 224 mobilenetv2_110d 75.052 (24.948) 92.180 (7.820) 4.5 bicubic 224 seresnet34 74.808 (25.192) 92.124 (7.876) 22 bilinear 224 mnasnet_b1 74.658 (25.342) 92.114 (7.886) 4.38 bicubic 224 spnasnet_100 74.084 (25.916) 91.818 (8.182) 4.42 bilinear 224 skresnet18 73.038 (26.962) 91.168 (8.832) 11.9 bicubic 224 mobilenetv2_100 72.978 (27.022) 91.016 (8.984) 3.5 bicubic 224 seresnet18 71.742 (28.258) 90.334 (9.666) 11.8 bicubic 224","title":"Self-trained Weights"},{"location":"results/#ported-weights","text":"For the models below, the model code and weight porting from Tensorflow or MXNet Gluon to Pytorch was done by myself. There are weights/models ported by others included in this repository, they are not listed below. 
Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size tf_efficientnet_l2_ns *tfp 88.352 (11.648) 98.652 (1.348) 480 bicubic 800 tf_efficientnet_l2_ns TBD TBD 480 bicubic 800 tf_efficientnet_l2_ns_475 88.234 (11.766) 98.546 (1.454)f 480 bicubic 475 tf_efficientnet_l2_ns_475 *tfp 88.172 (11.828) 98.566 (1.434) 480 bicubic 475 tf_efficientnet_b7_ns *tfp 86.844 (13.156) 98.084 (1.916) 66.35 bicubic 600 tf_efficientnet_b7_ns 86.840 (13.160) 98.094 (1.906) 66.35 bicubic 600 tf_efficientnet_b6_ns 86.452 (13.548) 97.882 (2.118) 43.04 bicubic 528 tf_efficientnet_b6_ns *tfp 86.444 (13.556) 97.880 (2.120) 43.04 bicubic 528 tf_efficientnet_b5_ns *tfp 86.064 (13.936) 97.746 (2.254) 30.39 bicubic 456 tf_efficientnet_b5_ns 86.088 (13.912) 97.752 (2.248) 30.39 bicubic 456 tf_efficientnet_b8_ap *tfp 85.436 (14.564) 97.272 (2.728) 87.4 bicubic 672 tf_efficientnet_b8 *tfp 85.384 (14.616) 97.394 (2.606) 87.4 bicubic 672 tf_efficientnet_b8 85.370 (14.630) 97.390 (2.610) 87.4 bicubic 672 tf_efficientnet_b8_ap 85.368 (14.632) 97.294 (2.706) 87.4 bicubic 672 tf_efficientnet_b4_ns *tfp 85.298 (14.702) 97.504 (2.496) 19.34 bicubic 380 tf_efficientnet_b4_ns 85.162 (14.838) 97.470 (2.530) 19.34 bicubic 380 tf_efficientnet_b7_ap *tfp 85.154 (14.846) 97.244 (2.756) 66.35 bicubic 600 tf_efficientnet_b7_ap 85.118 (14.882) 97.252 (2.748) 66.35 bicubic 600 tf_efficientnet_b7 *tfp 84.940 (15.060) 97.214 (2.786) 66.35 bicubic 600 tf_efficientnet_b7 84.932 (15.068) 97.208 (2.792) 66.35 bicubic 600 tf_efficientnet_b6_ap 84.786 (15.214) 97.138 (2.862) 43.04 bicubic 528 tf_efficientnet_b6_ap *tfp 84.760 (15.240) 97.124 (2.876) 43.04 bicubic 528 tf_efficientnet_b5_ap *tfp 84.276 (15.724) 96.932 (3.068) 30.39 bicubic 456 tf_efficientnet_b5_ap 84.254 (15.746) 96.976 (3.024) 30.39 bicubic 456 tf_efficientnet_b6 *tfp 84.140 (15.860) 96.852 (3.148) 43.04 bicubic 528 tf_efficientnet_b6 84.110 (15.890) 96.886 (3.114) 43.04 bicubic 528 tf_efficientnet_b3_ns *tfp 84.054 (15.946) 96.918 (3.082) 12.23 bicubic 300 tf_efficientnet_b3_ns 84.048 (15.952) 96.910 (3.090) 12.23 bicubic 300 tf_efficientnet_b5 *tfp 83.822 (16.178) 96.756 (3.244) 30.39 bicubic 456 tf_efficientnet_b5 83.812 (16.188) 96.748 (3.252) 30.39 bicubic 456 tf_efficientnet_b4_ap *tfp 83.278 (16.722) 96.376 (3.624) 19.34 bicubic 380 tf_efficientnet_b4_ap 83.248 (16.752) 96.388 (3.612) 19.34 bicubic 380 tf_efficientnet_b4 83.022 (16.978) 96.300 (3.700) 19.34 bicubic 380 tf_efficientnet_b4 *tfp 82.948 (17.052) 96.308 (3.692) 19.34 bicubic 380 tf_efficientnet_b2_ns *tfp 82.436 (17.564) 96.268 (3.732) 9.11 bicubic 260 tf_efficientnet_b2_ns 82.380 (17.620) 96.248 (3.752) 9.11 bicubic 260 tf_efficientnet_b3_ap *tfp 81.882 (18.118) 95.662 (4.338) 12.23 bicubic 300 tf_efficientnet_b3_ap 81.828 (18.172) 95.624 (4.376) 12.23 bicubic 300 tf_efficientnet_b3 81.636 (18.364) 95.718 (4.282) 12.23 bicubic 300 tf_efficientnet_b3 *tfp 81.576 (18.424) 95.662 (4.338) 12.23 bicubic 300 tf_efficientnet_lite4 81.528 (18.472) 95.668 (4.332) 13.00 bilinear 380 tf_efficientnet_b1_ns *tfp 81.514 (18.486) 95.776 (4.224) 7.79 bicubic 240 tf_efficientnet_lite4 *tfp 81.502 (18.498) 95.676 (4.324) 13.00 bilinear 380 tf_efficientnet_b1_ns 81.388 (18.612) 95.738 (4.262) 7.79 bicubic 240 gluon_senet154 81.224 (18.776) 95.356 (4.644) 115.09 bicubic 224 gluon_resnet152_v1s 81.012 (18.988) 95.416 (4.584) 60.32 bicubic 224 gluon_seresnext101_32x4d 80.902 (19.098) 95.294 (4.706) 48.96 bicubic 224 gluon_seresnext101_64x4d 80.890 (19.110) 95.304 (4.696) 88.23 bicubic 224 gluon_resnext101_64x4d 80.602 
(19.398) 94.994 (5.006) 83.46 bicubic 224 tf_efficientnet_el 80.534 (19.466) 95.190 (4.810) 10.59 bicubic 300 tf_efficientnet_el *tfp 80.476 (19.524) 95.200 (4.800) 10.59 bicubic 300 gluon_resnet152_v1d 80.470 (19.530) 95.206 (4.794) 60.21 bicubic 224 gluon_resnet101_v1d 80.424 (19.576) 95.020 (4.980) 44.57 bicubic 224 tf_efficientnet_b2_ap *tfp 80.420 (19.580) 95.040 (4.960) 9.11 bicubic 260 gluon_resnext101_32x4d 80.334 (19.666) 94.926 (5.074) 44.18 bicubic 224 tf_efficientnet_b2_ap 80.306 (19.694) 95.028 (4.972) 9.11 bicubic 260 gluon_resnet101_v1s 80.300 (19.700) 95.150 (4.850) 44.67 bicubic 224 tf_efficientnet_b2 *tfp 80.188 (19.812) 94.974 (5.026) 9.11 bicubic 260 tf_efficientnet_b2 80.086 (19.914) 94.908 (5.092) 9.11 bicubic 260 gluon_resnet152_v1c 79.916 (20.084) 94.842 (5.158) 60.21 bicubic 224 gluon_seresnext50_32x4d 79.912 (20.088) 94.818 (5.182) 27.56 bicubic 224 tf_efficientnet_lite3 79.812 (20.188) 94.914 (5.086) 8.20 bilinear 300 tf_efficientnet_lite3 *tfp 79.734 (20.266) 94.838 (5.162) 8.20 bilinear 300 gluon_resnet152_v1b 79.692 (20.308) 94.738 (5.262) 60.19 bicubic 224 gluon_xception65 79.604 (20.396) 94.748 (5.252) 39.92 bicubic 299 gluon_resnet101_v1c 79.544 (20.456) 94.586 (5.414) 44.57 bicubic 224 tf_efficientnet_b1_ap *tfp 79.532 (20.468) 94.378 (5.622) 7.79 bicubic 240 tf_efficientnet_cc_b1_8e *tfp 79.464 (20.536) 94.492 (5.508) 39.7 bicubic 240 gluon_resnext50_32x4d 79.356 (20.644) 94.424 (5.576) 25.03 bicubic 224 gluon_resnet101_v1b 79.304 (20.696) 94.524 (5.476) 44.55 bicubic 224 tf_efficientnet_cc_b1_8e 79.298 (20.702) 94.364 (5.636) 39.7 bicubic 240 tf_efficientnet_b1_ap 79.278 (20.722) 94.308 (5.692) 7.79 bicubic 240 tf_efficientnet_b1 *tfp 79.172 (20.828) 94.450 (5.550) 7.79 bicubic 240 gluon_resnet50_v1d 79.074 (20.926) 94.476 (5.524) 25.58 bicubic 224 tf_efficientnet_em *tfp 78.958 (21.042) 94.458 (5.542) 6.90 bicubic 240 tf_mixnet_l *tfp 78.846 (21.154) 94.212 (5.788) 7.33 bilinear 224 tf_efficientnet_b1 78.826 (21.174) 94.198 (5.802) 7.79 bicubic 240 tf_efficientnet_b0_ns *tfp 78.806 (21.194) 94.496 (5.504) 5.29 bicubic 224 gluon_inception_v3 78.804 (21.196) 94.380 (5.620) 27.16M bicubic 299 tf_mixnet_l 78.770 (21.230) 94.004 (5.996) 7.33 bicubic 224 tf_efficientnet_em 78.742 (21.258) 94.332 (5.668) 6.90 bicubic 240 gluon_resnet50_v1s 78.712 (21.288) 94.242 (5.758) 25.68 bicubic 224 tf_efficientnet_b0_ns 78.658 (21.342) 94.376 (5.624) 5.29 bicubic 224 tf_efficientnet_cc_b0_8e *tfp 78.314 (21.686) 93.790 (6.210) 24.0 bicubic 224 gluon_resnet50_v1c 78.010 (21.990) 93.988 (6.012) 25.58 bicubic 224 tf_efficientnet_cc_b0_8e 77.908 (22.092) 93.656 (6.344) 24.0 bicubic 224 tf_inception_v3 77.856 (22.144) 93.644 (6.356) 27.16M bicubic 299 tf_efficientnet_cc_b0_4e *tfp 77.746 (22.254) 93.552 (6.448) 13.3 bicubic 224 tf_efficientnet_es *tfp 77.616 (22.384) 93.750 (6.250) 5.44 bicubic 224 gluon_resnet50_v1b 77.578 (22.422) 93.718 (6.282) 25.56 bicubic 224 adv_inception_v3 77.576 (22.424) 93.724 (6.276) 27.16M bicubic 299 tf_efficientnet_lite2 *tfp 77.544 (22.456) 93.800 (6.200) 6.09 bilinear 260 tf_efficientnet_lite2 77.460 (22.540) 93.746 (6.254) 6.09 bicubic 260 tf_efficientnet_b0_ap *tfp 77.514 (22.486) 93.576 (6.424) 5.29 bicubic 224 tf_efficientnet_cc_b0_4e 77.304 (22.696) 93.332 (6.668) 13.3 bicubic 224 tf_efficientnet_es 77.264 (22.736) 93.600 (6.400) 5.44 bicubic 224 tf_efficientnet_b0 *tfp 77.258 (22.742) 93.478 (6.522) 5.29 bicubic 224 tf_efficientnet_b0_ap 77.084 (22.916) 93.254 (6.746) 5.29 bicubic 224 tf_mixnet_m *tfp 77.072 (22.928) 93.368 (6.632) 5.01 
bilinear 224 tf_mixnet_m 76.950 (23.050) 93.156 (6.844) 5.01 bicubic 224 tf_efficientnet_b0 76.848 (23.152) 93.228 (6.772) 5.29 bicubic 224 tf_efficientnet_lite1 *tfp 76.764 (23.236) 93.326 (6.674) 5.42 bilinear 240 tf_efficientnet_lite1 76.638 (23.362) 93.232 (6.768) 5.42 bicubic 240 tf_mixnet_s *tfp 75.800 (24.200) 92.788 (7.212) 4.13 bilinear 224 tf_mobilenetv3_large_100 *tfp 75.768 (24.232) 92.710 (7.290) 5.48 bilinear 224 tf_mixnet_s 75.648 (24.352) 92.636 (7.364) 4.13 bicubic 224 tf_mobilenetv3_large_100 75.516 (24.484) 92.600 (7.400) 5.48 bilinear 224 tf_efficientnet_lite0 *tfp 75.074 (24.926) 92.314 (7.686) 4.65 bilinear 224 tf_efficientnet_lite0 74.842 (25.158) 92.170 (7.830) 4.65 bicubic 224 gluon_resnet34_v1b 74.580 (25.420) 91.988 (8.012) 21.80 bicubic 224 tf_mobilenetv3_large_075 *tfp 73.730 (26.270) 91.616 (8.384) 3.99 bilinear 224 tf_mobilenetv3_large_075 73.442 (26.558) 91.352 (8.648) 3.99 bilinear 224 tf_mobilenetv3_large_minimal_100 *tfp 72.678 (27.322) 90.860 (9.140) 3.92 bilinear 224 tf_mobilenetv3_large_minimal_100 72.244 (27.756) 90.636 (9.364) 3.92 bilinear 224 tf_mobilenetv3_small_100 *tfp 67.918 (32.082) 87.958 (12.042 2.54 bilinear 224 tf_mobilenetv3_small_100 67.918 (32.082) 87.662 (12.338) 2.54 bilinear 224 tf_mobilenetv3_small_075 *tfp 66.142 (33.858) 86.498 (13.502) 2.04 bilinear 224 tf_mobilenetv3_small_075 65.718 (34.282) 86.136 (13.864) 2.04 bilinear 224 tf_mobilenetv3_small_minimal_100 *tfp 63.378 (36.622) 84.802 (15.198) 2.04 bilinear 224 tf_mobilenetv3_small_minimal_100 62.898 (37.102) 84.230 (15.770) 2.04 bilinear 224 Models with *tfp next to them were scored with --tf-preprocessing flag. The tf_efficientnet , tf_mixnet models require an equivalent for 'SAME' padding as their arch results in asymmetric padding. I've added this in the model creation wrapper, but it does come with a performance penalty. Sources for original weights: * tf_efficientnet* : Tensorflow TPU * tf_efficientnet_e* : Tensorflow TPU * tf_mixnet* : Tensorflow TPU * tf_inception* : Tensorflow Slim * gluon_* : MxNet Gluon","title":"Ported Weights"},{"location":"scripts/","text":"Scripts A train, validation, inference, and checkpoint cleaning script included in the github root folder. Scripts are not currently packaged in the pip release. The training and validation scripts evolved from early versions of the PyTorch Imagenet Examples . I have added significant functionality over time, including CUDA specific performance enhancements based on NVIDIA's APEX Examples . Training Script The variety of training args is large and not all combinations of options (or even options) have been fully tested. For the training dataset folder, specify the folder to the base that contains a train and validation folder. To train an SE-ResNet34 on ImageNet, locally distributed, 4 GPUs, one process per GPU w/ cosine schedule, random-erasing prob of 50% and per-pixel random value: ./distributed_train.sh 4 /data/imagenet --model seresnet34 --sched cosine --epochs 150 --warmup-epochs 5 --lr 0.4 --reprob 0.5 --remode pixel --batch-size 256 -j 4 NOTE: NVIDIA APEX should be installed to run in per-process distributed via DDP or to enable AMP mixed precision with the --amp flag Validation / Inference Scripts Validation and inference scripts are similar in usage. One outputs metrics on a validation set and the other outputs topk class ids in a csv. Specify the folder containing validation images, not the base as in training script. 
To validate with the model's pretrained weights (if they exist): python validate.py /imagenet/validation/ --model seresnext26_32x4d --pretrained To run inference from a checkpoint: python inference.py /imagenet/validation/ --model mobilenetv3_large_100 --checkpoint ./output/model_best.pth.tar","title":"Scripts"},{"location":"scripts/#scripts","text":"Train, validation, inference, and checkpoint cleaning scripts are included in the github root folder. Scripts are not currently packaged in the pip release. The training and validation scripts evolved from early versions of the PyTorch Imagenet Examples . I have added significant functionality over time, including CUDA specific performance enhancements based on NVIDIA's APEX Examples .","title":"Scripts"},{"location":"scripts/#training-script","text":"The variety of training args is large and not all combinations of options (or even individual options) have been fully tested. For the training dataset folder, specify the base folder that contains train and validation folders. To train an SE-ResNet34 on ImageNet, locally distributed, 4 GPUs, one process per GPU w/ cosine schedule, random-erasing prob of 50% and per-pixel random value: ./distributed_train.sh 4 /data/imagenet --model seresnet34 --sched cosine --epochs 150 --warmup-epochs 5 --lr 0.4 --reprob 0.5 --remode pixel --batch-size 256 -j 4 NOTE: NVIDIA APEX should be installed to run in per-process distributed via DDP or to enable AMP mixed precision with the --amp flag","title":"Training Script"},{"location":"scripts/#validation-inference-scripts","text":"Validation and inference scripts are similar in usage. One outputs metrics on a validation set and the other outputs topk class ids in a csv. Specify the folder containing validation images, not the base folder as in the training script. To validate with the model's pretrained weights (if they exist): python validate.py /imagenet/validation/ --model seresnext26_32x4d --pretrained To run inference from a checkpoint: python inference.py /imagenet/validation/ --model mobilenetv3_large_100 --checkpoint ./output/model_best.pth.tar","title":"Validation / Inference Scripts"},{"location":"training_hparam_examples/","text":"Training Examples EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5 These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016 MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5 These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce SE-ResNeXt-26-D and SE-ResNeXt-26-T These hparams (or similar) work well for a wide range of ResNet architectures; it's generally a good idea to increase the epoch count as the model size increases, i.e. approx 180-200 for ResNe(X)t50, and 220+ for larger. Increase batch size and LR proportionally for better GPUs or with AMP enabled.
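As a concrete illustration of that proportional (linear) scaling rule, using the 2 x 1080Ti numbers from the command below as the baseline and a purely hypothetical larger setup (a sketch, not something the training script computes for you):

# keep lr / global_batch_size roughly constant when changing hardware
base_lr, base_global_batch = 0.1, 2 * 112   # values from the 2 x 1080Ti command below
new_global_batch = 8 * 192                  # hypothetical: 8 faster GPUs at batch 192 each
new_lr = base_lr * new_global_batch / base_global_batch
print(round(new_lr, 3))                     # ~0.686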
These params were for 2 1080Ti cards: ./distributed_train.sh 2 /imagenet/ --model seresnext26t_32x4d --lr 0.1 --warmup-epochs 5 --epochs 160 --weight-decay 1e-4 --sched cosine --reprob 0.4 --remode pixel -b 112 EfficientNet-B3 with RandAugment - 81.5 top-1, 95.7 top-5 The training of this model started with the same command line as EfficientNet-B2 w/ RA above. After almost three weeks of training the process crashed. The results weren't looking amazing so I resumed the training several times with tweaks to a few params (increase RE prob, decrease rand-aug, increase ema-decay). Nothing looked great. I ended up averaging the best checkpoints from all restarts. The result is mediocre at default res/crop but oddly performs much better with a full image test crop of 1.0. EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5 Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. ./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048 ResNet50 with JSD loss and RandAugment (clean + 2x RA augs) - 79.04 top-1, 94.39 top-5 Trained on two older 1080Ti cards, this took a while. Only a slightly (and not statistically significant) better ImageNet validation result than my first good AugMix training of 78.99. However, these weights are more robust on tests with ImageNetV2, ImageNet-Sketch, etc. Unlike my first AugMix runs, I've enabled SplitBatchNorm, disabled random erasing on the clean split, and cranked up random erasing prob on the 2 augmented paths. ./distributed_train.sh 2 /imagenet -b 64 --model resnet50 --sched cosine --epochs 200 --lr 0.05 --amp --remode pixel --reprob 0.6 --aug-splits 3 --aa rand-m9-mstd0.5-inc1 --resplit --split-bn --jsd --dist-bn reduce EfficientNet-ES (EdgeTPU-Small) with RandAugment - 78.066 top-1, 93.926 top-5 Trained by Andrew Lavin with 8 V100 cards. Model EMA was not used, final checkpoint is the average of 8 best checkpoints during training. ./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5 ./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9 ResNeXt-50 32x4d w/ RandAugment - 79.762 top-1, 94.60 top-5 These params will also work well for SE-ResNeXt-50 and SK-ResNeXt-50 and likely 101. I used them for the SK-ResNeXt-50 32x4d that I trained with 2 GPUs using a slightly higher LR per effective batch size (lr=0.18, b=192 per GPU). The command line below is tuned for 8 GPU training.
./distributed_train.sh 8 /imagenet --model resnext50_32x4d --lr 0.6 --warmup-epochs 5 --epochs 240 --weight-decay 1e-4 --sched cosine --reprob 0.4 --recount 3 --remode pixel --aa rand-m7-mstd0.5-inc1 -b 192 -j 6 --amp --dist-bn reduce","title":"Training Examples"},{"location":"training_hparam_examples/#training-examples","text":"","title":"Training Examples"},{"location":"training_hparam_examples/#efficientnet-b2-with-randaugment-804-top-1-951-top-5","text":"These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016","title":"EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5"},{"location":"training_hparam_examples/#mixnet-xl-with-randaugment-805-top-1-949-top-5","text":"This params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce","title":"MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5"},{"location":"training_hparam_examples/#se-resnext-26-d-and-se-resnext-26-t","text":"These hparams (or similar) work well for a wide range of ResNet architecture, generally a good idea to increase the epoch # as the model size increases... ie approx 180-200 for ResNe(X)t50, and 220+ for larger. Increase batch size and LR proportionally for better GPUs or with AMP enabled. These params were for 2 1080Ti cards: ./distributed_train.sh 2 /imagenet/ --model seresnext26t_32x4d --lr 0.1 --warmup-epochs 5 --epochs 160 --weight-decay 1e-4 --sched cosine --reprob 0.4 --remode pixel -b 112","title":"SE-ResNeXt-26-D and SE-ResNeXt-26-T"},{"location":"training_hparam_examples/#efficientnet-b3-with-randaugment-815-top-1-957-top-5","text":"The training of this model started with the same command line as EfficientNet-B2 w/ RA above. After almost three weeks of training the process crashed. The results weren't looking amazing so I resumed the training several times with tweaks to a few params (increase RE prob, decrease rand-aug, increase ema-decay). Nothing looked great. I ended up averaging the best checkpoints from all restarts. The result is mediocre at default res/crop but oddly performs much better with a full image test crop of 1.0.","title":"EfficientNet-B3 with RandAugment - 81.5 top-1, 95.7 top-5"},{"location":"training_hparam_examples/#efficientnet-b0-with-randaugment-777-top-1-953-top-5","text":"Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. 
./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048","title":"EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5"},{"location":"training_hparam_examples/#resnet50-with-jsd-loss-and-randaugment-clean-2x-ra-augs-7904-top-1-9439-top-5","text":"Trained on two older 1080Ti cards, this took a while. Only slightly, non statistically better ImageNet validation result than my first good AugMix training of 78.99. However, these weights are more robust on tests with ImageNetV2, ImageNet-Sketch, etc. Unlike my first AugMix runs, I've enabled SplitBatchNorm, disabled random erasing on the clean split, and cranked up random erasing prob on the 2 augmented paths. ./distributed_train.sh 2 /imagenet -b 64 --model resnet50 --sched cosine --epochs 200 --lr 0.05 --amp --remode pixel --reprob 0.6 --aug-splits 3 --aa rand-m9-mstd0.5-inc1 --resplit --split-bn --jsd --dist-bn reduce","title":"ResNet50 with JSD loss and RandAugment (clean + 2x RA augs) - 79.04 top-1, 94.39 top-5"},{"location":"training_hparam_examples/#efficientnet-es-edgetpu-small-with-randaugment-78066-top-1-93926-top-5","text":"Trained by Andrew Lavin with 8 V100 cards. Model EMA was not used, final checkpoint is the average of 8 best checkpoints during training. ./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064","title":"EfficientNet-ES (EdgeTPU-Small) with RandAugment - 78.066 top-1, 93.926 top-5"},{"location":"training_hparam_examples/#mobilenetv3-large-100-75766-top-1-92542-top-5","text":"./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9","title":"MobileNetV3-Large-100 - 75.766 top-1, 92,542 top-5"},{"location":"training_hparam_examples/#resnext-50-32x4d-w-randaugment-79762-top-1-9460-top-5","text":"These params will also work well for SE-ResNeXt-50 and SK-ResNeXt-50 and likely 101. I used them for the SK-ResNeXt-50 32x4d that I trained with 2 GPU using a slightly higher LR per effective batch size (lr=0.18, b=192 per GPU). The cmd line below are tuned for 8 GPU training. ./distributed_train.sh 8 /imagenet --model resnext50_32x4d --lr 0.6 --warmup-epochs 5 --epochs 240 --weight-decay 1e-4 --sched cosine --reprob 0.4 --recount 3 --remode pixel --aa rand-m7-mstd0.5-inc1 -b 192 -j 6 --amp --dist-bn reduce","title":"ResNeXt-50 32x4d w/ RandAugment - 79.762 top-1, 94.60 top-5"}]} \ No newline at end of file +{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Getting Started Install The library can be installed with pip: pip install timm Conda Environment All development and testing has been done in Conda Python 3 environments on Linux x86-64 systems, specifically Python 3.6.x, 3.7.x., 3.8.x. 
Little to no care has been taken to be Python 2.x friendly and will not support it. If you run into any challenges running on Windows, or other OS, I'm definitely open to looking into those issues so long as it's in a reproducible (read Conda) environment. PyTorch versions 1.4, 1.5.x, 1.6, and 1.7 have been tested with this code. I've tried to keep the dependencies minimal, the setup is as per the PyTorch default install instructions for Conda: conda create -n torch-env conda activate torch-env conda install -c pytorch pytorch torchvision cudatoolkit=11 conda install pyyaml Load a Pretrained Model Pretrained models can be loaded using timm.create_model import timm m = timm . create_model ( 'mobilenetv3_large_100' , pretrained = True ) m . eval () List Models with Pretrained Weights import timm from pprint import pprint model_names = timm . list_models ( pretrained = True ) pprint ( model_names ) >>> [ 'adv_inception_v3' , 'cspdarknet53' , 'cspresnext50' , 'densenet121' , 'densenet161' , 'densenet169' , 'densenet201' , 'densenetblur121d' , 'dla34' , 'dla46_c' , ... ] List Model Architectures by Wildcard import timm from pprint import pprint model_names = timm . list_models ( '*resne*t*' ) pprint ( model_names ) >>> [ 'cspresnet50' , 'cspresnet50d' , 'cspresnet50w' , 'cspresnext50' , ... ]","title":"Getting Started"},{"location":"#getting-started","text":"","title":"Getting Started"},{"location":"#install","text":"The library can be installed with pip: pip install timm Conda Environment All development and testing has been done in Conda Python 3 environments on Linux x86-64 systems, specifically Python 3.6.x, 3.7.x., 3.8.x. Little to no care has been taken to be Python 2.x friendly and will not support it. If you run into any challenges running on Windows, or other OS, I'm definitely open to looking into those issues so long as it's in a reproducible (read Conda) environment. PyTorch versions 1.4, 1.5.x, 1.6, and 1.7 have been tested with this code. I've tried to keep the dependencies minimal, the setup is as per the PyTorch default install instructions for Conda: conda create -n torch-env conda activate torch-env conda install -c pytorch pytorch torchvision cudatoolkit=11 conda install pyyaml","title":"Install"},{"location":"#load-a-pretrained-model","text":"Pretrained models can be loaded using timm.create_model import timm m = timm . create_model ( 'mobilenetv3_large_100' , pretrained = True ) m . eval ()","title":"Load a Pretrained Model"},{"location":"#list-models-with-pretrained-weights","text":"import timm from pprint import pprint model_names = timm . list_models ( pretrained = True ) pprint ( model_names ) >>> [ 'adv_inception_v3' , 'cspdarknet53' , 'cspresnext50' , 'densenet121' , 'densenet161' , 'densenet169' , 'densenet201' , 'densenetblur121d' , 'dla34' , 'dla46_c' , ... ]","title":"List Models with Pretrained Weights"},{"location":"#list-model-architectures-by-wildcard","text":"import timm from pprint import pprint model_names = timm . list_models ( '*resne*t*' ) pprint ( model_names ) >>> [ 'cspresnet50' , 'cspresnet50d' , 'cspresnet50w' , 'cspresnext50' , ... ]","title":"List Model Architectures by Wildcard"},{"location":"archived_changes/","text":"Archived Changes April 5, 2020 Add some newly trained MobileNet-V2 models trained with latest h-params, rand augment. 
They compare quite favourably to EfficientNet-Lite 3.5M param MobileNet-V2 100 @ 73% 4.5M param MobileNet-V2 110d @ 75% 6.1M param MobileNet-V2 140 @ 76.5% 5.8M param MobileNet-V2 120d @ 77.3% March 18, 2020 Add EfficientNet-Lite models w/ weights ported from Tensorflow TPU Add RandAugment trained ResNeXt-50 32x4d weights with 79.8 top-1. Trained by Andrew Lavin (see Training section for hparams) Feb 29, 2020 New MobileNet-V3 Large weights trained from stratch with this code to 75.77% top-1 IMPORTANT CHANGE - default weight init changed for all MobilenetV3 / EfficientNet / related models overall results similar to a bit better training from scratch on a few smaller models tried performance early in training seems consistently improved but less difference by end set fix_group_fanout=False in _init_weight_goog fn if you need to reproducte past behaviour Experimental LR noise feature added applies a random perturbation to LR each epoch in specified range of training Feb 18, 2020 Big refactor of model layers and addition of several attention mechanisms. Several additions motivated by 'Compounding the Performance Improvements...' ( https://arxiv.org/abs/2001.06268 ): Move layer/module impl into layers subfolder/module of models and organize in a more granular fashion ResNet downsample paths now properly support dilation (output stride != 32) for avg_pool ('D' variant) and 3x3 (SENets) networks Add Selective Kernel Nets on top of ResNet base, pretrained weights skresnet18 - 73% top-1 skresnet34 - 76.9% top-1 skresnext50_32x4d (equiv to SKNet50) - 80.2% top-1 ECA and CECA (circular padding) attention layer contributed by Chris Ha CBAM attention experiment (not the best results so far, may remove) Attention factory to allow dynamically selecting one of SE, ECA, CBAM in the .se position for all ResNets Add DropBlock and DropPath (formerly DropConnect for EfficientNet/MobileNetv3) support to all ResNet variants Full dataset results updated that incl NoisyStudent weights and 2 of the 3 SK weights Feb 12, 2020 Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from Tensorflow TPU Feb 6, 2020 Add RandAugment trained EfficientNet-ES (EdgeTPU-Small) weights with 78.1 top-1. Trained by Andrew Lavin (see Training section for hparams) Feb \u00bd, 2020 Port new EfficientNet-B8 (RandAugment) weights, these are different than the B8 AdvProp, different input normalization. Update results csv files on all models for ImageNet validation and three other test sets Push PyPi package update Jan 31, 2020 Update ResNet50 weights with a new 79.038 result from further JSD / AugMix experiments. Full command line for reproduction in training section below. Jan 11/12, 2020 Master may be a bit unstable wrt to training, these changes have been tested but not all combos Implementations of AugMix added to existing RA and AA. Including numerous supporting pieces like JSD loss (Jensen-Shannon divergence + CE), and AugMixDataset SplitBatchNorm adaptation layer added for implementing Auxiliary BN as per AdvProp paper ResNet-50 AugMix trained model w/ 79% top-1 added seresnext26tn_32x4d - 77.99 top-1, 93.75 top-5 added to tiered experiment, higher img/s than 't' and 'd' Jan 3, 2020 Add RandAugment trained EfficientNet-B0 weight with 77.7 top-1. Trained by Michael Klachko with this code and recent hparams (see Training section) Add avg_checkpoints.py script for post training weight averaging and update all scripts with header docstrings and shebangs. 
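For reference, the core idea behind that post-training weight averaging is just an element-wise mean over the saved state dicts; a minimal sketch under that assumption (illustrative only, not the actual avg_checkpoints.py implementation, which layers additional checkpoint selection logic on top):

import torch

def average_checkpoints(paths):
    avg = None
    for p in paths:
        sd = torch.load(p, map_location='cpu')
        sd = sd.get('state_dict', sd)    # unwrap full training checkpoints if needed
        if avg is None:
            avg = {k: v.clone().float() for k, v in sd.items()}   # naive: cast everything to float
        else:
            for k, v in sd.items():
                avg[k] += v.float()
    return {k: v / len(paths) for k, v in avg.items()}

# torch.save(average_checkpoints(['ckpt-1.pth.tar', 'ckpt-2.pth.tar']), 'averaged.pth.tar')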
Dec 30, 2019 Merge Dushyant Mehta's PR for SelecSLS (Selective Short and Long Range Skip Connections) networks. Good GPU memory consumption and throughput. Original: https://github.com/mehtadushy/SelecSLS-Pytorch Dec 28, 2019 Add new model weights and training hparams (see Training Hparams section) efficientnet_b3 - 81.5 top-1, 95.7 top-5 at default res/crop, 81.9, 95.8 at 320x320 1.0 crop-pct trained with RandAugment, ended up with an interesting but less than perfect result (see training section) seresnext26d_32x4d - 77.6 top-1, 93.6 top-5 deep stem (32, 32, 64), avgpool downsample stem/dowsample from bag-of-tricks paper seresnext26t_32x4d - 78.0 top-1, 93.7 top-5 deep tiered stem (24, 48, 64), avgpool downsample (a modified 'D' variant) stem sizing mods from Jeremy Howard and fastai devs discussing ResNet architecture experiments Dec 23, 2019 Add RandAugment trained MixNet-XL weights with 80.48 top-1. --dist-bn argument added to train.py, will distribute BN stats between nodes after each train epoch, before eval Dec 4, 2019 Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5). Nov 29, 2019 Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded. AdvProp weights added Official TF MobileNetv3 weights added EfficientNet and MobileNetV3 hook based 'feature extraction' classes added. Will serve as basis for using models as backbones in obj detection/segmentation tasks. Lots more to be done here... HRNet classification models and weights added from https://github.com/HRNet/HRNet-Image-Classification Consistency in global pooling, reset_classifer , and forward_features across models forward_features always returns unpooled feature maps now Reasonable chance I broke something... let me know Nov 22, 2019 Add ImageNet training RandAugment implementation alongside AutoAugment. PyTorch Transform compatible format, using PIL. Currently training two EfficientNet models from scratch with promising results... will update. drop-connect cmd line arg finally added to train.py , no need to hack model fns. Works for efficientnet/mobilenetv3 based models, ignored otherwise.","title":"Archived Changes"},{"location":"archived_changes/#archived-changes","text":"","title":"Archived Changes"},{"location":"archived_changes/#april-5-2020","text":"Add some newly trained MobileNet-V2 models trained with latest h-params, rand augment. They compare quite favourably to EfficientNet-Lite 3.5M param MobileNet-V2 100 @ 73% 4.5M param MobileNet-V2 110d @ 75% 6.1M param MobileNet-V2 140 @ 76.5% 5.8M param MobileNet-V2 120d @ 77.3%","title":"April 5, 2020"},{"location":"archived_changes/#march-18-2020","text":"Add EfficientNet-Lite models w/ weights ported from Tensorflow TPU Add RandAugment trained ResNeXt-50 32x4d weights with 79.8 top-1. 
Trained by Andrew Lavin (see Training section for hparams)","title":"March 18, 2020"},{"location":"archived_changes/#feb-29-2020","text":"New MobileNet-V3 Large weights trained from stratch with this code to 75.77% top-1 IMPORTANT CHANGE - default weight init changed for all MobilenetV3 / EfficientNet / related models overall results similar to a bit better training from scratch on a few smaller models tried performance early in training seems consistently improved but less difference by end set fix_group_fanout=False in _init_weight_goog fn if you need to reproducte past behaviour Experimental LR noise feature added applies a random perturbation to LR each epoch in specified range of training","title":"Feb 29, 2020"},{"location":"archived_changes/#feb-18-2020","text":"Big refactor of model layers and addition of several attention mechanisms. Several additions motivated by 'Compounding the Performance Improvements...' ( https://arxiv.org/abs/2001.06268 ): Move layer/module impl into layers subfolder/module of models and organize in a more granular fashion ResNet downsample paths now properly support dilation (output stride != 32) for avg_pool ('D' variant) and 3x3 (SENets) networks Add Selective Kernel Nets on top of ResNet base, pretrained weights skresnet18 - 73% top-1 skresnet34 - 76.9% top-1 skresnext50_32x4d (equiv to SKNet50) - 80.2% top-1 ECA and CECA (circular padding) attention layer contributed by Chris Ha CBAM attention experiment (not the best results so far, may remove) Attention factory to allow dynamically selecting one of SE, ECA, CBAM in the .se position for all ResNets Add DropBlock and DropPath (formerly DropConnect for EfficientNet/MobileNetv3) support to all ResNet variants Full dataset results updated that incl NoisyStudent weights and 2 of the 3 SK weights","title":"Feb 18, 2020"},{"location":"archived_changes/#feb-12-2020","text":"Add EfficientNet-L2 and B0-B7 NoisyStudent weights ported from Tensorflow TPU","title":"Feb 12, 2020"},{"location":"archived_changes/#feb-6-2020","text":"Add RandAugment trained EfficientNet-ES (EdgeTPU-Small) weights with 78.1 top-1. Trained by Andrew Lavin (see Training section for hparams)","title":"Feb 6, 2020"},{"location":"archived_changes/#feb-12-2020_1","text":"Port new EfficientNet-B8 (RandAugment) weights, these are different than the B8 AdvProp, different input normalization. Update results csv files on all models for ImageNet validation and three other test sets Push PyPi package update","title":"Feb 1/2, 2020"},{"location":"archived_changes/#jan-31-2020","text":"Update ResNet50 weights with a new 79.038 result from further JSD / AugMix experiments. Full command line for reproduction in training section below.","title":"Jan 31, 2020"},{"location":"archived_changes/#jan-1112-2020","text":"Master may be a bit unstable wrt to training, these changes have been tested but not all combos Implementations of AugMix added to existing RA and AA. Including numerous supporting pieces like JSD loss (Jensen-Shannon divergence + CE), and AugMixDataset SplitBatchNorm adaptation layer added for implementing Auxiliary BN as per AdvProp paper ResNet-50 AugMix trained model w/ 79% top-1 added seresnext26tn_32x4d - 77.99 top-1, 93.75 top-5 added to tiered experiment, higher img/s than 't' and 'd'","title":"Jan 11/12, 2020"},{"location":"archived_changes/#jan-3-2020","text":"Add RandAugment trained EfficientNet-B0 weight with 77.7 top-1. 
Trained by Michael Klachko with this code and recent hparams (see Training section) Add avg_checkpoints.py script for post training weight averaging and update all scripts with header docstrings and shebangs.","title":"Jan 3, 2020"},{"location":"archived_changes/#dec-30-2019","text":"Merge Dushyant Mehta's PR for SelecSLS (Selective Short and Long Range Skip Connections) networks. Good GPU memory consumption and throughput. Original: https://github.com/mehtadushy/SelecSLS-Pytorch","title":"Dec 30, 2019"},{"location":"archived_changes/#dec-28-2019","text":"Add new model weights and training hparams (see Training Hparams section) efficientnet_b3 - 81.5 top-1, 95.7 top-5 at default res/crop, 81.9, 95.8 at 320x320 1.0 crop-pct trained with RandAugment, ended up with an interesting but less than perfect result (see training section) seresnext26d_32x4d - 77.6 top-1, 93.6 top-5 deep stem (32, 32, 64), avgpool downsample stem/dowsample from bag-of-tricks paper seresnext26t_32x4d - 78.0 top-1, 93.7 top-5 deep tiered stem (24, 48, 64), avgpool downsample (a modified 'D' variant) stem sizing mods from Jeremy Howard and fastai devs discussing ResNet architecture experiments","title":"Dec 28, 2019"},{"location":"archived_changes/#dec-23-2019","text":"Add RandAugment trained MixNet-XL weights with 80.48 top-1. --dist-bn argument added to train.py, will distribute BN stats between nodes after each train epoch, before eval","title":"Dec 23, 2019"},{"location":"archived_changes/#dec-4-2019","text":"Added weights from the first training from scratch of an EfficientNet (B2) with my new RandAugment implementation. Much better than my previous B2 and very close to the official AdvProp ones (80.4 top-1, 95.08 top-5).","title":"Dec 4, 2019"},{"location":"archived_changes/#nov-29-2019","text":"Brought EfficientNet and MobileNetV3 up to date with my https://github.com/rwightman/gen-efficientnet-pytorch code. Torchscript and ONNX export compat excluded. AdvProp weights added Official TF MobileNetv3 weights added EfficientNet and MobileNetV3 hook based 'feature extraction' classes added. Will serve as basis for using models as backbones in obj detection/segmentation tasks. Lots more to be done here... HRNet classification models and weights added from https://github.com/HRNet/HRNet-Image-Classification Consistency in global pooling, reset_classifer , and forward_features across models forward_features always returns unpooled feature maps now Reasonable chance I broke something... let me know","title":"Nov 29, 2019"},{"location":"archived_changes/#nov-22-2019","text":"Add ImageNet training RandAugment implementation alongside AutoAugment. PyTorch Transform compatible format, using PIL. Currently training two EfficientNet models from scratch with promising results... will update. drop-connect cmd line arg finally added to train.py , no need to hack model fns. Works for efficientnet/mobilenetv3 based models, ignored otherwise.","title":"Nov 22, 2019"},{"location":"changes/","text":"Recent Changes Oct 30, 2020 Test with PyTorch 1.7 and fix a small top-n metric view vs reshape issue. Convert newly added 224x224 Vision Transformer weights from official JAX repo. 81.8 top-1 for B/16, 83.1 L/16. Support PyTorch 1.7 optimized, native SiLU (aka Swish) activation. Add mapping to 'silu' name, custom swish will eventually be deprecated. Fix regression for loading pretrained classifier via direct model entrypoint functions. Didn't impact create_model() factory usage. PyPi release @ 0.3.0 version! 
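As a rough illustration of the 'silu' name mapping in the Oct 30 entry above, the sketch below selects the activation by name when building a model. It assumes timm.models.layers.get_act_layer is the activation factory entrypoint and that the model accepts an act_layer argument; treat it as a sketch rather than a guaranteed API.
import timm
# Assumed factory entrypoint; 'silu' resolves to the native torch.nn.SiLU on PyTorch 1.7+
# and falls back to the custom Swish implementation on older versions.
from timm.models.layers import get_act_layer
act = get_act_layer('silu')
m = timm.create_model('efficientnet_b0', pretrained=False, act_layer=act)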
Oct 26, 2020 Update Vision Transformer models to be compatible with official code release at https://github.com/google-research/vision_transformer Add Vision Transformer weights (ImageNet-21k pretrain) for 384x384 base and large models converted from official jax impl ViT-B/16 - 84.2 ViT-B/32 - 81.7 ViT-L/16 - 85.2 ViT-L/32 - 81.5 Oct 21, 2020 Weights added for Vision Transformer (ViT) models. 77.86 top-1 for 'small' and 79.35 for 'base'. Thanks to Christof for training the base model w/ lots of GPUs. Oct 13, 2020 Initial impl of Vision Transformer models. Both patch and hybrid (CNN backbone) variants. Currently trying to train... Adafactor and AdaHessian (FP32 only, no AMP) optimizers EdgeTPU-M ( efficientnet_em ) model trained in PyTorch, 79.3 top-1 Pip release, doc updates pending a few more changes... Sept 18, 2020 New ResNet 'D' weights. 72.7 (top-1) ResNet-18-D, 77.1 ResNet-34-D, 80.5 ResNet-50-D Added a few untrained defs for other ResNet models (66D, 101D, 152D, 200/200D) Sept 3, 2020 New weights Wide-ResNet50 - 81.5 top-1 (vs 78.5 torchvision) SEResNeXt50-32x4d - 81.3 top-1 (vs 79.1 cadene) Support for native Torch AMP and channels_last memory format added to train/validate scripts ( --channels-last , --native-amp vs --apex-amp ) Models tested with channels_last on latest NGC 20.08 container. AdaptiveAvgPool in attn layers changed to mean((2,3)) to work around bug with NHWC kernel. Aug 12, 2020 New/updated weights from training experiments EfficientNet-B3 - 82.1 top-1 (vs 81.6 for official with AA and 81.9 for AdvProp) RegNetY-3.2GF - 82.0 top-1 (78.9 from official ver) CSPResNet50 - 79.6 top-1 (76.6 from official ver) Add CutMix integrated w/ Mixup. See pull request for some usage examples Some fixes for using pretrained weights with in_chans != 3 on several models. Aug 5, 2020 Universal feature extraction, new models, new weights, new test sets. All models support the features_only=True argument for create_model call to return a network that extracts features from the deepest layer at each stride. New models CSPResNet, CSPResNeXt, CSPDarkNet, DarkNet ReXNet (Modified Aligned) Xception41/65/71 (a proper port of TF models) New trained weights SEResNet50 - 80.3 top-1 CSPDarkNet53 - 80.1 top-1 CSPResNeXt50 - 80.0 top-1 DPN68b - 79.2 top-1 EfficientNet-Lite0 (non-TF ver) - 75.5 (submitted by @hal-314 ) Add 'real' labels for ImageNet and ImageNet-Renditions test set, see results/README.md Test set ranking/top-n diff script by @KushajveerSingh Train script and loader/transform tweaks to punch through more aug arguments README and documentation overhaul. 
See initial (WIP) documentation at https://rwightman.github.io/pytorch-image-models/ adamp and sgdp optimizers added by @hellbell June 11, 2020 Bunch of changes: DenseNet models updated with memory efficient addition from torchvision (fixed a bug), blur pooling and deep stem additions VoVNet V1 and V2 models added, 39 V2 variant (ese_vovnet_39b) trained to 79.3 top-1 Activation factory added along with new activations: select act at model creation time for more flexibility in using activations compatible with scripting or tracing (ONNX export) hard_mish (experimental) added with memory-efficient grad, along with ME hard_swish context mgr for setting exportable/scriptable/no_jit states Norm + Activation combo layers added with initial trial support in DenseNet and VoVNet along with impl of EvoNorm and InplaceAbn wrapper that fit the interface Torchscript works for all but two of the model types as long as using Pytorch 1.5+, tests added for this Some import cleanup and classifier reset changes, all models will have classifier reset to nn.Identity on reset_classifer(0) call Prep for 0.1.28 pip release May 12, 2020 Add ResNeSt models (code adapted from https://github.com/zhanghang1989/ResNeSt , paper https://arxiv.org/abs/2004.08955 )) May 3, 2020 Pruned EfficientNet B1, B2, and B3 ( https://arxiv.org/abs/2002.08258 ) contributed by Yonathan Aflalo May 1, 2020 Merged a number of execellent contributions in the ResNet model family over the past month BlurPool2D and resnetblur models initiated by Chris Ha , I trained resnetblur50 to 79.3. TResNet models and SpaceToDepth, AntiAliasDownsampleLayer layers by mrT23 ecaresnet (50d, 101d, light) models and two pruned variants using pruning as per ( https://arxiv.org/abs/2002.08258 ) by Yonathan Aflalo 200 pretrained models in total now with updated results csv in results folder","title":"Recent Changes"},{"location":"changes/#recent-changes","text":"","title":"Recent Changes"},{"location":"changes/#oct-30-2020","text":"Test with PyTorch 1.7 and fix a small top-n metric view vs reshape issue. Convert newly added 224x224 Vision Transformer weights from official JAX repo. 81.8 top-1 for B/16, 83.1 L/16. Support PyTorch 1.7 optimized, native SiLU (aka Swish) activation. Add mapping to 'silu' name, custom swish will eventually be deprecated. Fix regression for loading pretrained classifier via direct model entrypoint functions. Didn't impact create_model() factory usage. PyPi release @ 0.3.0 version!","title":"Oct 30, 2020"},{"location":"changes/#oct-26-2020","text":"Update Vision Transformer models to be compatible with official code release at https://github.com/google-research/vision_transformer Add Vision Transformer weights (ImageNet-21k pretrain) for 384x384 base and large models converted from official jax impl ViT-B/16 - 84.2 ViT-B/32 - 81.7 ViT-L/16 - 85.2 ViT-L/32 - 81.5","title":"Oct 26, 2020"},{"location":"changes/#oct-21-2020","text":"Weights added for Vision Transformer (ViT) models. 77.86 top-1 for 'small' and 79.35 for 'base'. Thanks to Christof for training the base model w/ lots of GPUs.","title":"Oct 21, 2020"},{"location":"changes/#oct-13-2020","text":"Initial impl of Vision Transformer models. Both patch and hybrid (CNN backbone) variants. Currently trying to train... Adafactor and AdaHessian (FP32 only, no AMP) optimizers EdgeTPU-M ( efficientnet_em ) model trained in PyTorch, 79.3 top-1 Pip release, doc updates pending a few more changes...","title":"Oct 13, 2020"},{"location":"changes/#sept-18-2020","text":"New ResNet 'D' weights. 
72.7 (top-1) ResNet-18-D, 77.1 ResNet-34-D, 80.5 ResNet-50-D Added a few untrained defs for other ResNet models (66D, 101D, 152D, 200/200D)","title":"Sept 18, 2020"},{"location":"changes/#sept-3-2020","text":"New weights Wide-ResNet50 - 81.5 top-1 (vs 78.5 torchvision) SEResNeXt50-32x4d - 81.3 top-1 (vs 79.1 cadene) Support for native Torch AMP and channels_last memory format added to train/validate scripts ( --channels-last , --native-amp vs --apex-amp ) Models tested with channels_last on latest NGC 20.08 container. AdaptiveAvgPool in attn layers changed to mean((2,3)) to work around bug with NHWC kernel.","title":"Sept 3, 2020"},{"location":"changes/#aug-12-2020","text":"New/updated weights from training experiments EfficientNet-B3 - 82.1 top-1 (vs 81.6 for official with AA and 81.9 for AdvProp) RegNetY-3.2GF - 82.0 top-1 (78.9 from official ver) CSPResNet50 - 79.6 top-1 (76.6 from official ver) Add CutMix integrated w/ Mixup. See pull request for some usage examples Some fixes for using pretrained weights with in_chans != 3 on several models.","title":"Aug 12, 2020"},{"location":"changes/#aug-5-2020","text":"Universal feature extraction, new models, new weights, new test sets. All models support the features_only=True argument for create_model call to return a network that extracts features from the deepest layer at each stride. New models CSPResNet, CSPResNeXt, CSPDarkNet, DarkNet ReXNet (Modified Aligned) Xception41/65/71 (a proper port of TF models) New trained weights SEResNet50 - 80.3 top-1 CSPDarkNet53 - 80.1 top-1 CSPResNeXt50 - 80.0 top-1 DPN68b - 79.2 top-1 EfficientNet-Lite0 (non-TF ver) - 75.5 (submitted by @hal-314 ) Add 'real' labels for ImageNet and ImageNet-Renditions test set, see results/README.md Test set ranking/top-n diff script by @KushajveerSingh Train script and loader/transform tweaks to punch through more aug arguments README and documentation overhaul. 
See initial (WIP) documentation at https://rwightman.github.io/pytorch-image-models/ adamp and sgdp optimizers added by @hellbell","title":"Aug 5, 2020"},{"location":"changes/#june-11-2020","text":"Bunch of changes: DenseNet models updated with memory efficient addition from torchvision (fixed a bug), blur pooling and deep stem additions VoVNet V1 and V2 models added, 39 V2 variant (ese_vovnet_39b) trained to 79.3 top-1 Activation factory added along with new activations: select act at model creation time for more flexibility in using activations compatible with scripting or tracing (ONNX export) hard_mish (experimental) added with memory-efficient grad, along with ME hard_swish context mgr for setting exportable/scriptable/no_jit states Norm + Activation combo layers added with initial trial support in DenseNet and VoVNet along with impl of EvoNorm and InplaceAbn wrapper that fit the interface Torchscript works for all but two of the model types as long as using PyTorch 1.5+, tests added for this Some import cleanup and classifier reset changes, all models will have classifier reset to nn.Identity on reset_classifier(0) call Prep for 0.1.28 pip release","title":"June 11, 2020"},{"location":"changes/#may-12-2020","text":"Add ResNeSt models (code adapted from https://github.com/zhanghang1989/ResNeSt , paper https://arxiv.org/abs/2004.08955 )","title":"May 12, 2020"},{"location":"changes/#may-3-2020","text":"Pruned EfficientNet B1, B2, and B3 ( https://arxiv.org/abs/2002.08258 ) contributed by Yonathan Aflalo","title":"May 3, 2020"},{"location":"changes/#may-1-2020","text":"Merged a number of excellent contributions in the ResNet model family over the past month BlurPool2D and resnetblur models initiated by Chris Ha , I trained resnetblur50 to 79.3. TResNet models and SpaceToDepth, AntiAliasDownsampleLayer layers by mrT23 ecaresnet (50d, 101d, light) models and two pruned variants using pruning as per ( https://arxiv.org/abs/2002.08258 ) by Yonathan Aflalo 200 pretrained models in total now with updated results csv in results folder","title":"May 1, 2020"},{"location":"feature_extraction/","text":"Feature Extraction All of the models in timm have consistent mechanisms for obtaining various types of features from the model for tasks besides classification. Penultimate Layer Features (Pre-Classifier Features) The features from the penultimate model layer can be obtained in several ways without requiring model surgery (although feel free to do surgery). One must first decide if they want pooled or un-pooled features. Unpooled There are three ways to obtain unpooled features. Without modifying the network, one can call model.forward_features(input) on any model instead of the usual model(input) . This will bypass the head classifier and global pooling for networks. If one wants to explicitly modify the network to return unpooled features, they can either create the model without a classifier and pooling, or remove it later. Both paths remove the parameters associated with the classifier from the network. forward_features() import torch import timm m = timm . create_model ( 'xception41' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 299 , 299 )) print ( f 'Original shape: {o.shape}' ) o = m . forward_features ( torch . randn ( 2 , 3 , 299 , 299 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Original shape: torch.Size([2, 1000]) Unpooled shape: torch.Size([2, 2048, 10, 10]) Create with no classifier and pooling import torch import timm m = timm .
create_model ( 'resnet50' , pretrained = True , num_classes = 0 , global_pool = '' ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Unpooled shape: torch.Size([2, 2048, 7, 7]) Remove it later import torch import timm m = timm . create_model ( 'densenet121' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Original shape: {o.shape}' ) m . reset_classifier ( 0 , '' ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Original shape: torch.Size([2, 1000]) Unpooled shape: torch.Size([2, 1024, 7, 7]) Pooled To modify the network to return pooled features, one can use forward_features() and pool/flatten the result themselves, or modify the network like above but keep pooling intact. Create with no classifier import torch import timm m = timm . create_model ( 'resnet50' , pretrained = True , num_classes = 0 ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Pooled shape: {o.shape}' ) Output: Pooled shape: torch.Size([2, 2048]) Remove it later import torch import timm m = timm . create_model ( 'ese_vovnet19b_dw' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Original shape: {o.shape}' ) m . reset_classifier ( 0 ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Pooled shape: {o.shape}' ) Output: Pooled shape: torch.Size([2, 1024]) Multi-scale Feature Maps (Feature Pyramid) Object detection, segmentation, keypoint, and a variety of dense pixel tasks require access to feature maps from the backbone network at multiple scales. This is often done by modifying the original classification network. Since each network varies quite a bit in structure, it's not uncommon to see only a few backbones supported in any given obj detection or segmentation library. timm allows a consistent interface for creating any of the included models as feature backbones that output feature maps for selected levels. A feature backbone can be created by adding the argument features_only=True to any create_model call. By default 5 strides will be output from most models (not all have that many), with the first starting at 2 (some start at 1 or 4). Create a feature map extraction model import torch import timm m = timm . create_model ( 'resnest26d' , features_only = True , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) for x in o : print ( x . shape ) Output: torch.Size([2, 64, 112, 112]) torch.Size([2, 256, 56, 56]) torch.Size([2, 512, 28, 28]) torch.Size([2, 1024, 14, 14]) torch.Size([2, 2048, 7, 7]) Query the feature information After a feature backbone has been created, it can be queried to provide channel or resolution reduction information to the downstream heads without requiring static config or hardcoded constants. The .feature_info attribute is a class encapsulating the information about the feature extraction points. import torch import timm m = timm . create_model ( 'regnety_032' , features_only = True , pretrained = True ) print ( f 'Feature channels: {m.feature_info.channels()}' ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) for x in o : print ( x . shape ) Output: Feature channels: [32, 72, 216, 576, 1512] torch.Size([2, 32, 112, 112]) torch.Size([2, 72, 56, 56]) torch.Size([2, 216, 28, 28]) torch.Size([2, 576, 14, 14]) torch.Size([2, 1512, 7, 7]) Select specific feature levels or limit the stride There are to additional creation arguments impacting the output features. 
out_indices selects which indices to output output_stride limits the feature output stride of the network (also works in classification mode BTW) out_indices is supported by all models, but not all models have the same index to feature stride mapping. Look at the code or check feature_info to compare. The out indices generally correspond to the C(i+1)th feature level (a 2^(i+1) reduction). For most models, index 0 is the stride 2 features, and index 4 is stride 32. output_stride is achieved by converting layers to use dilated convolutions. Doing so is not always straightforward, some networks only support output_stride=32 . import torch import timm m = timm . create_model ( 'ecaresnet101d' , features_only = True , output_stride = 8 , out_indices = ( 2 , 4 ), pretrained = True ) print ( f 'Feature channels: {m.feature_info.channels()}' ) print ( f 'Feature reduction: {m.feature_info.reduction()}' ) o = m ( torch . randn ( 2 , 3 , 320 , 320 )) for x in o : print ( x . shape ) Output: Feature channels: [512, 2048] Feature reduction: [8, 8] torch.Size([2, 512, 40, 40]) torch.Size([2, 2048, 40, 40])","title":"Feature Extraction"},{"location":"feature_extraction/#feature-extraction","text":"All of the models in timm have consistent mechanisms for obtaining various types of features from the model for tasks besides classification.","title":"Feature Extraction"},{"location":"feature_extraction/#penultimate-layer-features-pre-classifier-features","text":"The features from the penultimate model layer can be obtained in severay ways without requiring model surgery (although feel free to do surgery). One must first decide if they want pooled or un-pooled features.","title":"Penultimate Layer Features (Pre-Classifier Features)"},{"location":"feature_extraction/#unpooled","text":"There are three ways to obtain unpooled features. Without modifying the network, one can call model.forward_features(input) on any model instead of the usual model(input) . This will bypass the head classifier and global pooling for networks. If one wants to explicitly modify the network to return unpooled features, they can either create the model without a classifier and pooling, or remove it later. Both paths remove the parameters associated with the classifier from the network.","title":"Unpooled"},{"location":"feature_extraction/#forward_features","text":"import torch import timm m = timm . create_model ( 'xception41' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 299 , 299 )) print ( f 'Original shape: {o.shape}' ) o = m . forward_features ( torch . randn ( 2 , 3 , 299 , 299 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Original shape: torch.Size([2, 1000]) Unpooled shape: torch.Size([2, 2048, 10, 10])","title":"forward_features()"},{"location":"feature_extraction/#create-with-no-classifier-and-pooling","text":"import torch import timm m = timm . create_model ( 'resnet50' , pretrained = True , num_classes = 0 , global_pool = '' ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Unpooled shape: torch.Size([2, 2048, 7, 7])","title":"Create with no classifier and pooling"},{"location":"feature_extraction/#remove-it-later","text":"import torch import timm m = timm . create_model ( 'densenet121' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Original shape: {o.shape}' ) m . reset_classifier ( 0 , '' ) o = m ( torch . 
randn ( 2 , 3 , 224 , 224 )) print ( f 'Unpooled shape: {o.shape}' ) Output: Original shape: torch.Size([2, 1000]) Unpooled shape: torch.Size([2, 1024, 7, 7])","title":"Remove it later"},{"location":"feature_extraction/#pooled","text":"To modify the network to return pooled features, one can use forward_features() and pool/flatten the result themselves, or modify the network like above but keep pooling intact.","title":"Pooled"},{"location":"feature_extraction/#create-with-no-classifier","text":"import torch import timm m = timm . create_model ( 'resnet50' , pretrained = True , num_classes = 0 ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Pooled shape: {o.shape}' ) Output: Pooled shape: torch.Size([2, 2048])","title":"Create with no classifier"},{"location":"feature_extraction/#remove-it-later_1","text":"import torch import timm m = timm . create_model ( 'ese_vovnet19b_dw' , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Original shape: {o.shape}' ) m . reset_classifier ( 0 ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) print ( f 'Pooled shape: {o.shape}' ) Output: Pooled shape: torch.Size([2, 1024])","title":"Remove it later"},{"location":"feature_extraction/#multi-scale-feature-maps-feature-pyramid","text":"Object detection, segmentation, keypoint, and a variety of dense pixel tasks require access to feature maps from the backbone network at multiple scales. This is often done by modifying the original classification network. Since each network varies quite a bit in structure, it's not uncommon to see only a few backbones supported in any given obj detection or segmentation library. timm allows a consistent interface for creating any of the included models as feature backbones that output feature maps for selected levels. A feature backbone can be created by adding the argument features_only=True to any create_model call. By default 5 strides will be output from most models (not all have that many), with the first starting at 2 (some start at 1 or 4).","title":"Multi-scale Feature Maps (Feature Pyramid)"},{"location":"feature_extraction/#create-a-feature-map-extraction-model","text":"import torch import timm m = timm . create_model ( 'resnest26d' , features_only = True , pretrained = True ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) for x in o : print ( x . shape ) Output: torch.Size([2, 64, 112, 112]) torch.Size([2, 256, 56, 56]) torch.Size([2, 512, 28, 28]) torch.Size([2, 1024, 14, 14]) torch.Size([2, 2048, 7, 7])","title":"Create a feature map extraction model"},{"location":"feature_extraction/#query-the-feature-information","text":"After a feature backbone has been created, it can be queried to provide channel or resolution reduction information to the downstream heads without requiring static config or hardcoded constants. The .feature_info attribute is a class encapsulating the information about the feature extraction points. import torch import timm m = timm . create_model ( 'regnety_032' , features_only = True , pretrained = True ) print ( f 'Feature channels: {m.feature_info.channels()}' ) o = m ( torch . randn ( 2 , 3 , 224 , 224 )) for x in o : print ( x . 
shape ) Output: Feature channels: [32, 72, 216, 576, 1512] torch.Size([2, 32, 112, 112]) torch.Size([2, 72, 56, 56]) torch.Size([2, 216, 28, 28]) torch.Size([2, 576, 14, 14]) torch.Size([2, 1512, 7, 7])","title":"Query the feature information"},{"location":"feature_extraction/#select-specific-feature-levels-or-limit-the-stride","text":"There are to additional creation arguments impacting the output features. out_indices selects which indices to output output_stride limits the feature output stride of the network (also works in classification mode BTW) out_indices is supported by all models, but not all models have the same index to feature stride mapping. Look at the code or check feature_info to compare. The out indices generally correspond to the C(i+1)th feature level (a 2^(i+1) reduction). For most models, index 0 is the stride 2 features, and index 4 is stride 32. output_stride is achieved by converting layers to use dilated convolutions. Doing so is not always straightforward, some networks only support output_stride=32 . import torch import timm m = timm . create_model ( 'ecaresnet101d' , features_only = True , output_stride = 8 , out_indices = ( 2 , 4 ), pretrained = True ) print ( f 'Feature channels: {m.feature_info.channels()}' ) print ( f 'Feature reduction: {m.feature_info.reduction()}' ) o = m ( torch . randn ( 2 , 3 , 320 , 320 )) for x in o : print ( x . shape ) Output: Feature channels: [512, 2048] Feature reduction: [8, 8] torch.Size([2, 512, 40, 40]) torch.Size([2, 2048, 40, 40])","title":"Select specific feature levels or limit the stride"},{"location":"models/","text":"Model Architectures The model architectures included come from a wide variety of sources. Sources, including papers, original impl (\"reference code\") that I rewrote / adapted, and PyTorch impl that I leveraged directly (\"code\") are listed below. Most included models have pretrained weights. The weights are either: from their original sources ported by myself from their original impl in a different framework (e.g. 
Tensorflow models) trained from scratch using the included training script The validation results for the pretrained weights can be found here Cross-Stage Partial Networks [ cspnet.py ] Paper: CSPNet: A New Backbone that can Enhance Learning Capability of CNN - https://arxiv.org/abs/1911.11929 Reference impl: https://github.com/WongKinYiu/CrossStagePartialNetworks DenseNet [ densenet.py ] Paper: Densely Connected Convolutional Networks - https://arxiv.org/abs/1608.06993 Code: https://github.com/pytorch/vision/tree/master/torchvision/models DLA [ dla.py ] Paper: https://arxiv.org/abs/1707.06484 Code: https://github.com/ucbdrive/dla Dual-Path Networks [ dpn.py ] Paper: Dual Path Networks - https://arxiv.org/abs/1707.01629 My PyTorch code: https://github.com/rwightman/pytorch-dpn-pretrained Reference code: https://github.com/cypw/DPNs HRNet [ hrnet.py ] Paper: Deep High-Resolution Representation Learning for Visual Recognition - https://arxiv.org/abs/1908.07919 Code: https://github.com/HRNet/HRNet-Image-Classification Inception-V3 [ inception_v3.py ] Paper: Rethinking the Inception Architecture for Computer Vision - https://arxiv.org/abs/1512.00567 Code: https://github.com/pytorch/vision/tree/master/torchvision/models Inception-V4 [ inception_v4.py ] Paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - https://arxiv.org/abs/1602.07261 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets Inception-ResNet-V2 [ inception_resnet_v2.py ] Paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - https://arxiv.org/abs/1602.07261 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets NASNet-A [ nasnet.py ] Papers: Learning Transferable Architectures for Scalable Image Recognition - https://arxiv.org/abs/1707.07012 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet PNasNet-5 [ pnasnet.py ] Papers: Progressive Neural Architecture Search - https://arxiv.org/abs/1712.00559 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet EfficientNet [ efficientnet.py ] Papers: EfficientNet NoisyStudent (B0-B7, L2) - https://arxiv.org/abs/1911.04252 EfficientNet AdvProp (B0-B8) - https://arxiv.org/abs/1911.09665 EfficientNet (B0-B7) - https://arxiv.org/abs/1905.11946 EfficientNet-EdgeTPU (S, M, L) - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html MixNet - https://arxiv.org/abs/1907.09595 MNASNet B1, A1 (Squeeze-Excite), and Small - https://arxiv.org/abs/1807.11626 MobileNet-V2 - https://arxiv.org/abs/1801.04381 FBNet-C - https://arxiv.org/abs/1812.03443 Single-Path NAS - https://arxiv.org/abs/1904.02877 My PyTorch code: https://github.com/rwightman/gen-efficientnet-pytorch Reference code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet MobileNet-V3 [ mobilenetv3.py ] Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet RegNet [ regnet.py ] Paper: Designing Network Design Spaces - https://arxiv.org/abs/2003.13678 Reference code: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py ResNet, ResNeXt [ resnet.py 
] ResNet (V1B) Paper: Deep Residual Learning for Image Recognition - https://arxiv.org/abs/1512.03385 Code: https://github.com/pytorch/vision/tree/master/torchvision/models ResNeXt Paper: Aggregated Residual Transformations for Deep Neural Networks - https://arxiv.org/abs/1611.05431 Code: https://github.com/pytorch/vision/tree/master/torchvision/models 'Bag of Tricks' / Gluon C, D, E, S ResNet variants Paper: Bag of Tricks for Image Classification with CNNs - https://arxiv.org/abs/1812.01187 Code: https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/resnetv1b.py Instagram pretrained / ImageNet tuned ResNeXt101 Paper: Exploring the Limits of Weakly Supervised Pretraining - https://arxiv.org/abs/1805.00932 Weights: https://pytorch.org/hub/facebookresearch_WSL-Images_resnext (NOTE: CC BY-NC 4.0 License, NOT commercial friendly) Semi-supervised (SSL) / Semi-weakly Supervised (SWSL) ResNet and ResNeXts Paper: Billion-scale semi-supervised learning for image classification - https://arxiv.org/abs/1905.00546 Weights: https://github.com/facebookresearch/semi-supervised-ImageNet1K-models (NOTE: CC BY-NC 4.0 License, NOT commercial friendly) Squeeze-and-Excitation Networks Paper: Squeeze-and-Excitation Networks - https://arxiv.org/abs/1709.01507 Code: Added to ResNet base, this is current version going forward, old senet.py is being deprecated ECAResNet (ECA-Net) Paper: ECA-Net: Efficient Channel Attention for Deep CNN - https://arxiv.org/abs/1910.03151v4 Code: Added to ResNet base, ECA module contributed by @VRandme, reference https://github.com/BangguWu/ECANet Res2Net [ res2net.py ] Paper: Res2Net: A New Multi-scale Backbone Architecture - https://arxiv.org/abs/1904.01169 Code: https://github.com/gasvn/Res2Net ResNeSt [ resnest.py ] Paper: ResNeSt: Split-Attention Networks - https://arxiv.org/abs/2004.08955 Code: https://github.com/zhanghang1989/ResNeSt ReXNet [ rexnet.py ] Paper: ReXNet: Diminishing Representational Bottleneck on CNN - https://arxiv.org/abs/2007.00992 Code: https://github.com/clovaai/rexnet Selective-Kernel Networks [ sknet.py ] Paper: Selective-Kernel Networks - https://arxiv.org/abs/1903.06586 Code: https://github.com/implus/SKNet , https://github.com/clovaai/assembled-cnn SelecSLS [ selecsls.py ] Paper: XNect: Real-time Multi-Person 3D Motion Capture with a Single RGB Camera - https://arxiv.org/abs/1907.00837 Code: https://github.com/mehtadushy/SelecSLS-Pytorch Squeeze-and-Excitation Networks [ senet.py ] NOTE: I am deprecating this version of the networks, the new ones are part of resnet.py Paper: Squeeze-and-Excitation Networks - https://arxiv.org/abs/1709.01507 Code: https://github.com/Cadene/pretrained-models.pytorch TResNet [ tresnet.py ] Paper: TResNet: High Performance GPU-Dedicated Architecture - https://arxiv.org/abs/2003.13630 Code: https://github.com/mrT23/TResNet Vision Transformer [ vision_transformer.py ] Paper: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929 Reference code and pretrained weights: https://github.com/google-research/vision_transformer VovNet V2 and V1 [ vovnet.py ] Paper: CenterMask : Real-Time Anchor-Free Instance Segmentation - https://arxiv.org/abs/1911.06667 Reference code: https://github.com/youngwanLEE/vovnet-detectron2 Xception [ xception.py ] Paper: Xception: Deep Learning with Depthwise Separable Convolutions - https://arxiv.org/abs/1610.02357 Code: https://github.com/Cadene/pretrained-models.pytorch Xception (Modified Aligned, Gluon) [ gluon_xception.py ] Paper: 
Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/dmlc/gluon-cv/tree/master/gluoncv/model_zoo , https://github.com/jfzhang95/pytorch-deeplab-xception/ Xception (Modified Aligned, TF) [ aligned_xception.py ] Paper: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/tensorflow/models/tree/master/research/deeplab","title":"Model Architectures"},{"location":"models/#model-architectures","text":"The model architectures included come from a wide variety of sources. Sources, including papers, original impl (\"reference code\") that I rewrote / adapted, and PyTorch impl that I leveraged directly (\"code\") are listed below. Most included models have pretrained weights. The weights are either: from their original sources ported by myself from their original impl in a different framework (e.g. Tensorflow models) trained from scratch using the included training script The validation results for the pretrained weights can be found here","title":"Model Architectures"},{"location":"models/#cross-stage-partial-networks-cspnetpy","text":"Paper: CSPNet: A New Backbone that can Enhance Learning Capability of CNN - https://arxiv.org/abs/1911.11929 Reference impl: https://github.com/WongKinYiu/CrossStagePartialNetworks","title":"Cross-Stage Partial Networks [cspnet.py]"},{"location":"models/#densenet-densenetpy","text":"Paper: Densely Connected Convolutional Networks - https://arxiv.org/abs/1608.06993 Code: https://github.com/pytorch/vision/tree/master/torchvision/models","title":"DenseNet [densenet.py]"},{"location":"models/#dla-dlapy","text":"Paper: https://arxiv.org/abs/1707.06484 Code: https://github.com/ucbdrive/dla","title":"DLA [dla.py]"},{"location":"models/#dual-path-networks-dpnpy","text":"Paper: Dual Path Networks - https://arxiv.org/abs/1707.01629 My PyTorch code: https://github.com/rwightman/pytorch-dpn-pretrained Reference code: https://github.com/cypw/DPNs","title":"Dual-Path Networks [dpn.py]"},{"location":"models/#hrnet-hrnetpy","text":"Paper: Deep High-Resolution Representation Learning for Visual Recognition - https://arxiv.org/abs/1908.07919 Code: https://github.com/HRNet/HRNet-Image-Classification","title":"HRNet [hrnet.py]"},{"location":"models/#inception-v3-inception_v3py","text":"Paper: Rethinking the Inception Architecture for Computer Vision - https://arxiv.org/abs/1512.00567 Code: https://github.com/pytorch/vision/tree/master/torchvision/models","title":"Inception-V3 [inception_v3.py]"},{"location":"models/#inception-v4-inception_v4py","text":"Paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - https://arxiv.org/abs/1602.07261 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets","title":"Inception-V4 [inception_v4.py]"},{"location":"models/#inception-resnet-v2-inception_resnet_v2py","text":"Paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - https://arxiv.org/abs/1602.07261 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets","title":"Inception-ResNet-V2 [inception_resnet_v2.py]"},{"location":"models/#nasnet-a-nasnetpy","text":"Papers: Learning Transferable Architectures for Scalable Image Recognition - 
https://arxiv.org/abs/1707.07012 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet","title":"NASNet-A [nasnet.py]"},{"location":"models/#pnasnet-5-pnasnetpy","text":"Papers: Progressive Neural Architecture Search - https://arxiv.org/abs/1712.00559 Code: https://github.com/Cadene/pretrained-models.pytorch Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet","title":"PNasNet-5 [pnasnet.py]"},{"location":"models/#efficientnet-efficientnetpy","text":"Papers: EfficientNet NoisyStudent (B0-B7, L2) - https://arxiv.org/abs/1911.04252 EfficientNet AdvProp (B0-B8) - https://arxiv.org/abs/1911.09665 EfficientNet (B0-B7) - https://arxiv.org/abs/1905.11946 EfficientNet-EdgeTPU (S, M, L) - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html MixNet - https://arxiv.org/abs/1907.09595 MNASNet B1, A1 (Squeeze-Excite), and Small - https://arxiv.org/abs/1807.11626 MobileNet-V2 - https://arxiv.org/abs/1801.04381 FBNet-C - https://arxiv.org/abs/1812.03443 Single-Path NAS - https://arxiv.org/abs/1904.02877 My PyTorch code: https://github.com/rwightman/gen-efficientnet-pytorch Reference code: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet","title":"EfficientNet [efficientnet.py]"},{"location":"models/#mobilenet-v3-mobilenetv3py","text":"Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 Reference code: https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet","title":"MobileNet-V3 [mobilenetv3.py]"},{"location":"models/#regnet-regnetpy","text":"Paper: Designing Network Design Spaces - https://arxiv.org/abs/2003.13678 Reference code: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py","title":"RegNet [regnet.py]"},{"location":"models/#resnet-resnext-resnetpy","text":"ResNet (V1B) Paper: Deep Residual Learning for Image Recognition - https://arxiv.org/abs/1512.03385 Code: https://github.com/pytorch/vision/tree/master/torchvision/models ResNeXt Paper: Aggregated Residual Transformations for Deep Neural Networks - https://arxiv.org/abs/1611.05431 Code: https://github.com/pytorch/vision/tree/master/torchvision/models 'Bag of Tricks' / Gluon C, D, E, S ResNet variants Paper: Bag of Tricks for Image Classification with CNNs - https://arxiv.org/abs/1812.01187 Code: https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/resnetv1b.py Instagram pretrained / ImageNet tuned ResNeXt101 Paper: Exploring the Limits of Weakly Supervised Pretraining - https://arxiv.org/abs/1805.00932 Weights: https://pytorch.org/hub/facebookresearch_WSL-Images_resnext (NOTE: CC BY-NC 4.0 License, NOT commercial friendly) Semi-supervised (SSL) / Semi-weakly Supervised (SWSL) ResNet and ResNeXts Paper: Billion-scale semi-supervised learning for image classification - https://arxiv.org/abs/1905.00546 Weights: https://github.com/facebookresearch/semi-supervised-ImageNet1K-models (NOTE: CC BY-NC 4.0 License, NOT commercial friendly) Squeeze-and-Excitation Networks Paper: Squeeze-and-Excitation Networks - https://arxiv.org/abs/1709.01507 Code: Added to ResNet base, this is current version going forward, old senet.py is being deprecated ECAResNet (ECA-Net) Paper: ECA-Net: Efficient Channel Attention for Deep CNN - https://arxiv.org/abs/1910.03151v4 Code: Added to ResNet base, ECA module contributed by @VRandme, reference https://github.com/BangguWu/ECANet","title":"ResNet, ResNeXt 
[resnet.py]"},{"location":"models/#res2net-res2netpy","text":"Paper: Res2Net: A New Multi-scale Backbone Architecture - https://arxiv.org/abs/1904.01169 Code: https://github.com/gasvn/Res2Net","title":"Res2Net [res2net.py]"},{"location":"models/#resnest-resnestpy","text":"Paper: ResNeSt: Split-Attention Networks - https://arxiv.org/abs/2004.08955 Code: https://github.com/zhanghang1989/ResNeSt","title":"ResNeSt [resnest.py]"},{"location":"models/#rexnet-rexnetpy","text":"Paper: ReXNet: Diminishing Representational Bottleneck on CNN - https://arxiv.org/abs/2007.00992 Code: https://github.com/clovaai/rexnet","title":"ReXNet [rexnet.py]"},{"location":"models/#selective-kernel-networks-sknetpy","text":"Paper: Selective-Kernel Networks - https://arxiv.org/abs/1903.06586 Code: https://github.com/implus/SKNet , https://github.com/clovaai/assembled-cnn","title":"Selective-Kernel Networks [sknet.py]"},{"location":"models/#selecsls-selecslspy","text":"Paper: XNect: Real-time Multi-Person 3D Motion Capture with a Single RGB Camera - https://arxiv.org/abs/1907.00837 Code: https://github.com/mehtadushy/SelecSLS-Pytorch","title":"SelecSLS [selecsls.py]"},{"location":"models/#squeeze-and-excitation-networks-senetpy","text":"NOTE: I am deprecating this version of the networks, the new ones are part of resnet.py Paper: Squeeze-and-Excitation Networks - https://arxiv.org/abs/1709.01507 Code: https://github.com/Cadene/pretrained-models.pytorch","title":"Squeeze-and-Excitation Networks [senet.py]"},{"location":"models/#tresnet-tresnetpy","text":"Paper: TResNet: High Performance GPU-Dedicated Architecture - https://arxiv.org/abs/2003.13630 Code: https://github.com/mrT23/TResNet","title":"TResNet [tresnet.py]"},{"location":"models/#vision-transformer-vision_transformerpy","text":"Paper: An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929 Reference code and pretrained weights: https://github.com/google-research/vision_transformer","title":"Vision Transformer [vision_transformer.py]"},{"location":"models/#vovnet-v2-and-v1-vovnetpy","text":"Paper: CenterMask : Real-Time Anchor-Free Instance Segmentation - https://arxiv.org/abs/1911.06667 Reference code: https://github.com/youngwanLEE/vovnet-detectron2","title":"VovNet V2 and V1 [vovnet.py]"},{"location":"models/#xception-xceptionpy","text":"Paper: Xception: Deep Learning with Depthwise Separable Convolutions - https://arxiv.org/abs/1610.02357 Code: https://github.com/Cadene/pretrained-models.pytorch","title":"Xception [xception.py]"},{"location":"models/#xception-modified-aligned-gluon-gluon_xceptionpy","text":"Paper: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/dmlc/gluon-cv/tree/master/gluoncv/model_zoo , https://github.com/jfzhang95/pytorch-deeplab-xception/","title":"Xception (Modified Aligned, Gluon) [gluon_xception.py]"},{"location":"models/#xception-modified-aligned-tf-aligned_xceptionpy","text":"Paper: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation - https://arxiv.org/abs/1802.02611 Reference code: https://github.com/tensorflow/models/tree/master/research/deeplab","title":"Xception (Modified Aligned, TF) [aligned_xception.py]"},{"location":"results/","text":"Results CSV files containing an ImageNet-1K validation and out-of-distribution (OOD) test set validation results for all included models with pretrained weights and default configurations is located here . 
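The "default configurations" referred to above (input size, interpolation, crop percentage, normalization) can be inspected on a created model. A small sketch, assuming the default_cfg attribute exposed by timm models:
import timm
m = timm.create_model('resnet50', pretrained=True)
# default_cfg holds the settings the validation results assume
# (input_size, interpolation, crop_pct, mean/std, weight URL, ...).
print(m.default_cfg)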
Self-trained Weights I've leveraged the training scripts in this repository to train a few of the models with to good levels of performance. Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size efficientnet_b3a 82.242 (17.758) 96.114 (3.886) 12.23 bicubic 320 (1.0 crop) efficientnet_b3 82.076 (17.924) 96.020 (3.980) 12.23 bicubic 300 regnet_32 82.002 (17.998) 95.906 (4.094) 19.44 bicubic 224 skresnext50d_32x4d 81.278 (18.722) 95.366 (4.634) 27.5 bicubic 288 (1.0 crop) seresnext50d_32x4d 81.266 (18.734) 95.620 (4.380) 27.6 bicubic 224 efficientnet_b2a 80.608 (19.392) 95.310 (4.690) 9.11 bicubic 288 (1.0 crop) resnet50d 80.530 (19.470) 95.160 (4.840) 25.6 bicubic 224 mixnet_xl 80.478 (19.522) 94.932 (5.068) 11.90 bicubic 224 efficientnet_b2 80.402 (19.598) 95.076 (4.924) 9.11 bicubic 260 seresnet50 80.274 (19.726) 95.070 (4.930) 28.1 bicubic 224 skresnext50d_32x4d 80.156 (19.844) 94.642 (5.358) 27.5 bicubic 224 cspdarknet53 80.058 (19.942) 95.084 (4.916) 27.6 bicubic 256 cspresnext50 80.040 (19.960) 94.944 (5.056) 20.6 bicubic 224 resnext50_32x4d 79.762 (20.238) 94.600 (5.400) 25 bicubic 224 resnext50d_32x4d 79.674 (20.326) 94.868 (5.132) 25.1 bicubic 224 cspresnet50 79.574 (20.426) 94.712 (5.288) 21.6 bicubic 256 ese_vovnet39b 79.320 (20.680) 94.710 (5.290) 24.6 bicubic 224 resnetblur50 79.290 (20.710) 94.632 (5.368) 25.6 bicubic 224 dpn68b 79.216 (20.784) 94.414 (5.586) 12.6 bicubic 224 resnet50 79.038 (20.962) 94.390 (5.610) 25.6 bicubic 224 mixnet_l 78.976 (21.024 94.184 (5.816) 7.33 bicubic 224 efficientnet_b1 78.692 (21.308) 94.086 (5.914) 7.79 bicubic 240 efficientnet_es 78.066 (21.934) 93.926 (6.074) 5.44 bicubic 224 seresnext26t_32x4d 77.998 (22.002) 93.708 (6.292) 16.8 bicubic 224 seresnext26tn_32x4d 77.986 (22.014) 93.746 (6.254) 16.8 bicubic 224 efficientnet_b0 77.698 (22.302) 93.532 (6.468) 5.29 bicubic 224 seresnext26d_32x4d 77.602 (22.398) 93.608 (6.392) 16.8 bicubic 224 mobilenetv2_120d 77.294 (22.706 93.502 (6.498) 5.8 bicubic 224 mixnet_m 77.256 (22.744) 93.418 (6.582) 5.01 bicubic 224 resnet34d 77.116 (22.884) 93.382 (6.618) 21.8 bicubic 224 seresnext26_32x4d 77.104 (22.896) 93.316 (6.684) 16.8 bicubic 224 skresnet34 76.912 (23.088) 93.322 (6.678) 22.2 bicubic 224 ese_vovnet19b_dw 76.798 (23.202) 93.268 (6.732) 6.5 bicubic 224 resnet26d 76.68 (23.32) 93.166 (6.834) 16 bicubic 224 densenetblur121d 76.576 (23.424) 93.190 (6.810) 8.0 bicubic 224 mobilenetv2_140 76.524 (23.476) 92.990 (7.010) 6.1 bicubic 224 mixnet_s 75.988 (24.012) 92.794 (7.206) 4.13 bicubic 224 mobilenetv3_large_100 75.766 (24.234) 92.542 (7.458) 5.5 bicubic 224 mobilenetv3_rw 75.634 (24.366) 92.708 (7.292) 5.5 bicubic 224 mnasnet_a1 75.448 (24.552) 92.604 (7.396) 3.89 bicubic 224 resnet26 75.292 (24.708) 92.57 (7.43) 16 bicubic 224 fbnetc_100 75.124 (24.876) 92.386 (7.614) 5.6 bilinear 224 resnet34 75.110 (24.890) 92.284 (7.716) 22 bilinear 224 mobilenetv2_110d 75.052 (24.948) 92.180 (7.820) 4.5 bicubic 224 seresnet34 74.808 (25.192) 92.124 (7.876) 22 bilinear 224 mnasnet_b1 74.658 (25.342) 92.114 (7.886) 4.38 bicubic 224 spnasnet_100 74.084 (25.916) 91.818 (8.182) 4.42 bilinear 224 skresnet18 73.038 (26.962) 91.168 (8.832) 11.9 bicubic 224 mobilenetv2_100 72.978 (27.022) 91.016 (8.984) 3.5 bicubic 224 resnet18d 72.260 (27.740) 90.696 (9.304) 11.7 bicubic 224 seresnet18 71.742 (28.258) 90.334 (9.666) 11.8 bicubic 224 Ported and Other Weights For weights ported from other deep learning frameworks (Tensorflow, MXNet GluonCV) or copied from other PyTorch sources, please see the full results 
tables for ImageNet and various OOD test sets at in the results tables . Model code .py files contain links to original sources of models and weights.","title":"Results"},{"location":"results/#results","text":"CSV files containing an ImageNet-1K validation and out-of-distribution (OOD) test set validation results for all included models with pretrained weights and default configurations is located here .","title":"Results"},{"location":"results/#self-trained-weights","text":"I've leveraged the training scripts in this repository to train a few of the models with to good levels of performance. Model Acc@1 (Err) Acc@5 (Err) Param # (M) Interpolation Image Size efficientnet_b3a 82.242 (17.758) 96.114 (3.886) 12.23 bicubic 320 (1.0 crop) efficientnet_b3 82.076 (17.924) 96.020 (3.980) 12.23 bicubic 300 regnet_32 82.002 (17.998) 95.906 (4.094) 19.44 bicubic 224 skresnext50d_32x4d 81.278 (18.722) 95.366 (4.634) 27.5 bicubic 288 (1.0 crop) seresnext50d_32x4d 81.266 (18.734) 95.620 (4.380) 27.6 bicubic 224 efficientnet_b2a 80.608 (19.392) 95.310 (4.690) 9.11 bicubic 288 (1.0 crop) resnet50d 80.530 (19.470) 95.160 (4.840) 25.6 bicubic 224 mixnet_xl 80.478 (19.522) 94.932 (5.068) 11.90 bicubic 224 efficientnet_b2 80.402 (19.598) 95.076 (4.924) 9.11 bicubic 260 seresnet50 80.274 (19.726) 95.070 (4.930) 28.1 bicubic 224 skresnext50d_32x4d 80.156 (19.844) 94.642 (5.358) 27.5 bicubic 224 cspdarknet53 80.058 (19.942) 95.084 (4.916) 27.6 bicubic 256 cspresnext50 80.040 (19.960) 94.944 (5.056) 20.6 bicubic 224 resnext50_32x4d 79.762 (20.238) 94.600 (5.400) 25 bicubic 224 resnext50d_32x4d 79.674 (20.326) 94.868 (5.132) 25.1 bicubic 224 cspresnet50 79.574 (20.426) 94.712 (5.288) 21.6 bicubic 256 ese_vovnet39b 79.320 (20.680) 94.710 (5.290) 24.6 bicubic 224 resnetblur50 79.290 (20.710) 94.632 (5.368) 25.6 bicubic 224 dpn68b 79.216 (20.784) 94.414 (5.586) 12.6 bicubic 224 resnet50 79.038 (20.962) 94.390 (5.610) 25.6 bicubic 224 mixnet_l 78.976 (21.024 94.184 (5.816) 7.33 bicubic 224 efficientnet_b1 78.692 (21.308) 94.086 (5.914) 7.79 bicubic 240 efficientnet_es 78.066 (21.934) 93.926 (6.074) 5.44 bicubic 224 seresnext26t_32x4d 77.998 (22.002) 93.708 (6.292) 16.8 bicubic 224 seresnext26tn_32x4d 77.986 (22.014) 93.746 (6.254) 16.8 bicubic 224 efficientnet_b0 77.698 (22.302) 93.532 (6.468) 5.29 bicubic 224 seresnext26d_32x4d 77.602 (22.398) 93.608 (6.392) 16.8 bicubic 224 mobilenetv2_120d 77.294 (22.706 93.502 (6.498) 5.8 bicubic 224 mixnet_m 77.256 (22.744) 93.418 (6.582) 5.01 bicubic 224 resnet34d 77.116 (22.884) 93.382 (6.618) 21.8 bicubic 224 seresnext26_32x4d 77.104 (22.896) 93.316 (6.684) 16.8 bicubic 224 skresnet34 76.912 (23.088) 93.322 (6.678) 22.2 bicubic 224 ese_vovnet19b_dw 76.798 (23.202) 93.268 (6.732) 6.5 bicubic 224 resnet26d 76.68 (23.32) 93.166 (6.834) 16 bicubic 224 densenetblur121d 76.576 (23.424) 93.190 (6.810) 8.0 bicubic 224 mobilenetv2_140 76.524 (23.476) 92.990 (7.010) 6.1 bicubic 224 mixnet_s 75.988 (24.012) 92.794 (7.206) 4.13 bicubic 224 mobilenetv3_large_100 75.766 (24.234) 92.542 (7.458) 5.5 bicubic 224 mobilenetv3_rw 75.634 (24.366) 92.708 (7.292) 5.5 bicubic 224 mnasnet_a1 75.448 (24.552) 92.604 (7.396) 3.89 bicubic 224 resnet26 75.292 (24.708) 92.57 (7.43) 16 bicubic 224 fbnetc_100 75.124 (24.876) 92.386 (7.614) 5.6 bilinear 224 resnet34 75.110 (24.890) 92.284 (7.716) 22 bilinear 224 mobilenetv2_110d 75.052 (24.948) 92.180 (7.820) 4.5 bicubic 224 seresnet34 74.808 (25.192) 92.124 (7.876) 22 bilinear 224 mnasnet_b1 74.658 (25.342) 92.114 (7.886) 4.38 bicubic 224 spnasnet_100 74.084 
(25.916) 91.818 (8.182) 4.42 bilinear 224 skresnet18 73.038 (26.962) 91.168 (8.832) 11.9 bicubic 224 mobilenetv2_100 72.978 (27.022) 91.016 (8.984) 3.5 bicubic 224 resnet18d 72.260 (27.740) 90.696 (9.304) 11.7 bicubic 224 seresnet18 71.742 (28.258) 90.334 (9.666) 11.8 bicubic 224","title":"Self-trained Weights"},{"location":"results/#ported-and-other-weights","text":"For weights ported from other deep learning frameworks (Tensorflow, MXNet GluonCV) or copied from other PyTorch sources, please see the full results tables for ImageNet and various OOD test sets in the results tables . Model code .py files contain links to original sources of models and weights.","title":"Ported and Other Weights"},{"location":"scripts/","text":"Scripts Train, validation, inference, and checkpoint cleaning scripts are included in the github root folder. Scripts are not currently packaged in the pip release. The training and validation scripts evolved from early versions of the PyTorch Imagenet Examples . I have added significant functionality over time, including CUDA specific performance enhancements based on NVIDIA's APEX Examples . Training Script The variety of training args is large and not all combinations of options (or even individual options) have been fully tested. For the training dataset folder, specify the base folder that contains train and validation folders. To train an SE-ResNet34 on ImageNet, locally distributed, 4 GPUs, one process per GPU w/ cosine schedule, random-erasing prob of 50% and per-pixel random value: ./distributed_train.sh 4 /data/imagenet --model seresnet34 --sched cosine --epochs 150 --warmup-epochs 5 --lr 0.4 --reprob 0.5 --remode pixel --batch-size 256 -j 4 NOTE: NVIDIA APEX should be installed to run in per-process distributed via DDP or to enable AMP mixed precision with the --amp flag Validation / Inference Scripts Validation and inference scripts are similar in usage. One outputs metrics on a validation set and the other outputs top-k class ids in a csv. Specify the folder containing validation images, not the base as in the training script. To validate with the model's pretrained weights (if they exist): python validate.py /imagenet/validation/ --model seresnext26_32x4d --pretrained To run inference from a checkpoint: python inference.py /imagenet/validation/ --model mobilenetv3_large_100 --checkpoint ./output/model_best.pth.tar","title":"Scripts"},{"location":"scripts/#scripts","text":"Train, validation, inference, and checkpoint cleaning scripts are included in the github root folder. Scripts are not currently packaged in the pip release. The training and validation scripts evolved from early versions of the PyTorch Imagenet Examples . I have added significant functionality over time, including CUDA specific performance enhancements based on NVIDIA's APEX Examples .","title":"Scripts"},{"location":"scripts/#training-script","text":"The variety of training args is large and not all combinations of options (or even individual options) have been fully tested. For the training dataset folder, specify the base folder that contains train and validation folders.
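For clarity, a typical base folder layout has class sub-folders per split, as expected by torchvision-style ImageFolder datasets (folder and file names below are illustrative only):
/data/imagenet/
    train/
        n01440764/
            img_0001.JPEG
            ...
    validation/
        n01440764/
            img_0002.JPEG
            ...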
To train an SE-ResNet34 on ImageNet, locally distributed, 4 GPUs, one process per GPU w/ cosine schedule, random-erasing prob of 50% and per-pixel random value: ./distributed_train.sh 4 /data/imagenet --model seresnet34 --sched cosine --epochs 150 --warmup-epochs 5 --lr 0.4 --reprob 0.5 --remode pixel --batch-size 256 -j 4 NOTE: NVIDIA APEX should be installed to run in per-process distributed via DDP or to enable AMP mixed precision with the --amp flag","title":"Training Script"},{"location":"scripts/#validation-inference-scripts","text":"Validation and inference scripts are similar in usage. One outputs metrics on a validation set and the other outputs top-k class ids in a csv. Specify the folder containing validation images, not the base folder as in the training script. To validate with the model's pretrained weights (if they exist): python validate.py /imagenet/validation/ --model seresnext26_32x4d --pretrained To run inference from a checkpoint: python inference.py /imagenet/validation/ --model mobilenetv3_large_100 --checkpoint ./output/model_best.pth.tar","title":"Validation / Inference Scripts"},{"location":"training_hparam_examples/","text":"Training Examples EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5 These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016 MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5 These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce SE-ResNeXt-26-D and SE-ResNeXt-26-T These hparams (or similar) work well for a wide range of ResNet architectures; it's generally a good idea to increase the epoch # as the model size increases... i.e. approx 180-200 for ResNe(X)t50, and 220+ for larger. Increase batch size and LR proportionally for better GPUs or with AMP enabled. These params were for 2 1080Ti cards: ./distributed_train.sh 2 /imagenet/ --model seresnext26t_32x4d --lr 0.1 --warmup-epochs 5 --epochs 160 --weight-decay 1e-4 --sched cosine --reprob 0.4 --remode pixel -b 112 EfficientNet-B3 with RandAugment - 81.5 top-1, 95.7 top-5 The training of this model started with the same command line as EfficientNet-B2 w/ RA above. After almost three weeks of training the process crashed. The results weren't looking amazing so I resumed the training several times with tweaks to a few params (increase RE prob, decrease rand-aug, increase ema-decay). Nothing looked great. I ended up averaging the best checkpoints from all restarts. The result is mediocre at default res/crop but oddly performs much better with a full image test crop of 1.0. EfficientNet-B0 with RandAugment - 77.7 top-1, 93.5 top-5 Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. 
./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048 ResNet50 with JSD loss and RandAugment (clean + 2x RA augs) - 79.04 top-1, 94.39 top-5 Trained on two older 1080Ti cards, this took a while. Only a slightly better (not statistically significant) ImageNet validation result than my first good AugMix training of 78.99. However, these weights are more robust on tests with ImageNetV2, ImageNet-Sketch, etc. Unlike my first AugMix runs, I've enabled SplitBatchNorm, disabled random erasing on the clean split, and cranked up random erasing prob on the 2 augmented paths. ./distributed_train.sh 2 /imagenet -b 64 --model resnet50 --sched cosine --epochs 200 --lr 0.05 --amp --remode pixel --reprob 0.6 --aug-splits 3 --aa rand-m9-mstd0.5-inc1 --resplit --split-bn --jsd --dist-bn reduce EfficientNet-ES (EdgeTPU-Small) with RandAugment - 78.066 top-1, 93.926 top-5 Trained by Andrew Lavin with 8 V100 cards. Model EMA was not used; the final checkpoint is the average of the 8 best checkpoints during training. ./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5 ./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9 ResNeXt-50 32x4d w/ RandAugment - 79.762 top-1, 94.60 top-5 These params will also work well for SE-ResNeXt-50 and SK-ResNeXt-50 and likely 101. I used them for the SK-ResNeXt-50 32x4d that I trained with 2 GPUs using a slightly higher LR per effective batch size (lr=0.18, b=192 per GPU). The cmd line below is tuned for 8 GPU training. 
./distributed_train.sh 8 /imagenet --model resnext50_32x4d --lr 0.6 --warmup-epochs 5 --epochs 240 --weight-decay 1e-4 --sched cosine --reprob 0.4 --recount 3 --remode pixel --aa rand-m7-mstd0.5-inc1 -b 192 -j 6 --amp --dist-bn reduce","title":"Training Examples"},{"location":"training_hparam_examples/#training-examples","text":"","title":"Training Examples"},{"location":"training_hparam_examples/#efficientnet-b2-with-randaugment-804-top-1-951-top-5","text":"These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016","title":"EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5"},{"location":"training_hparam_examples/#mixnet-xl-with-randaugment-805-top-1-949-top-5","text":"These params are for dual Titan RTX cards with NVIDIA Apex installed: ./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce","title":"MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5"},{"location":"training_hparam_examples/#se-resnext-26-d-and-se-resnext-26-t","text":"These hparams (or similar) work well for a wide range of ResNet architectures; it's generally a good idea to increase the epoch # as the model size increases... i.e. approx 180-200 for ResNe(X)t50, and 220+ for larger. Increase batch size and LR proportionally for better GPUs or with AMP enabled. These params were for 2 1080Ti cards: ./distributed_train.sh 2 /imagenet/ --model seresnext26t_32x4d --lr 0.1 --warmup-epochs 5 --epochs 160 --weight-decay 1e-4 --sched cosine --reprob 0.4 --remode pixel -b 112","title":"SE-ResNeXt-26-D and SE-ResNeXt-26-T"},{"location":"training_hparam_examples/#efficientnet-b3-with-randaugment-815-top-1-957-top-5","text":"The training of this model started with the same command line as EfficientNet-B2 w/ RA above. After almost three weeks of training the process crashed. The results weren't looking amazing so I resumed the training several times with tweaks to a few params (increase RE prob, decrease rand-aug, increase ema-decay). Nothing looked great. I ended up averaging the best checkpoints from all restarts. The result is mediocre at default res/crop but oddly performs much better with a full image test crop of 1.0.","title":"EfficientNet-B3 with RandAugment - 81.5 top-1, 95.7 top-5"},{"location":"training_hparam_examples/#efficientnet-b0-with-randaugment-777-top-1-953-top-5","text":"Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. 
./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048","title":"EfficientNet-B0 with RandAugment - 77.7 top-1, 93.5 top-5"},{"location":"training_hparam_examples/#resnet50-with-jsd-loss-and-randaugment-clean-2x-ra-augs-7904-top-1-9439-top-5","text":"Trained on two older 1080Ti cards, this took a while. Only a slightly better (not statistically significant) ImageNet validation result than my first good AugMix training of 78.99. However, these weights are more robust on tests with ImageNetV2, ImageNet-Sketch, etc. Unlike my first AugMix runs, I've enabled SplitBatchNorm, disabled random erasing on the clean split, and cranked up random erasing prob on the 2 augmented paths. ./distributed_train.sh 2 /imagenet -b 64 --model resnet50 --sched cosine --epochs 200 --lr 0.05 --amp --remode pixel --reprob 0.6 --aug-splits 3 --aa rand-m9-mstd0.5-inc1 --resplit --split-bn --jsd --dist-bn reduce","title":"ResNet50 with JSD loss and RandAugment (clean + 2x RA augs) - 79.04 top-1, 94.39 top-5"},{"location":"training_hparam_examples/#efficientnet-es-edgetpu-small-with-randaugment-78066-top-1-93926-top-5","text":"Trained by Andrew Lavin with 8 V100 cards. Model EMA was not used; the final checkpoint is the average of the 8 best checkpoints during training. ./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064","title":"EfficientNet-ES (EdgeTPU-Small) with RandAugment - 78.066 top-1, 93.926 top-5"},{"location":"training_hparam_examples/#mobilenetv3-large-100-75766-top-1-92542-top-5","text":"./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9","title":"MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5"},{"location":"training_hparam_examples/#resnext-50-32x4d-w-randaugment-79762-top-1-9460-top-5","text":"These params will also work well for SE-ResNeXt-50 and SK-ResNeXt-50 and likely 101. I used them for the SK-ResNeXt-50 32x4d that I trained with 2 GPUs using a slightly higher LR per effective batch size (lr=0.18, b=192 per GPU). The cmd line below is tuned for 8 GPU training. ./distributed_train.sh 8 /imagenet --model resnext50_32x4d --lr 0.6 --warmup-epochs 5 --epochs 240 --weight-decay 1e-4 --sched cosine --reprob 0.4 --recount 3 --remode pixel --aa rand-m7-mstd0.5-inc1 -b 192 -j 6 --amp --dist-bn reduce","title":"ResNeXt-50 32x4d w/ RandAugment - 79.762 top-1, 94.60 top-5"}]} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml index 082fbf68..6e192ffa 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1,35 +1,35 @@