From 5ccf682a8fe076b89e673ae75430814a6fe70bce Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Thu, 6 Jan 2022 18:08:39 -0800
Subject: [PATCH] Remove deprecated bn-tf train arg and create_model handler.
 Add evos/evob models back into fx test filter until norm_norm_norm branch
 merged.

---
 docs/training_hparam_examples.md    | 10 +++++-----
 tests/test_models.py                |  1 +
 timm/models/efficientnet_builder.py |  2 +-
 timm/models/factory.py              |  7 -------
 train.py                            |  3 ---
 5 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/docs/training_hparam_examples.md b/docs/training_hparam_examples.md
index c2afc2b1..74b9790e 100644
--- a/docs/training_hparam_examples.md
+++ b/docs/training_hparam_examples.md
@@ -3,12 +3,12 @@
 ## EfficientNet-B2 with RandAugment - 80.4 top-1, 95.1 top-5
 These params are for dual Titan RTX cards with NVIDIA Apex installed:
 
-`./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
+`./distributed_train.sh 2 /imagenet/ --model efficientnet_b2 -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-path 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .016`
 
 ## MixNet-XL with RandAugment - 80.5 top-1, 94.9 top-5
 These params are for dual Titan RTX cards with NVIDIA Apex installed:
 
-`./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce`
+`./distributed_train.sh 2 /imagenet/ --model mixnet_xl -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .969 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.3 --drop-path 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.3 --amp --lr .016 --dist-bn reduce`
 
 ## SE-ResNeXt-26-D and SE-ResNeXt-26-T
 These hparams (or similar) work well for a wide range of ResNet architectures; it's generally a good idea to increase the epoch count as the model size increases, i.e. approx 180-200 for ResNe(X)t50 and 220+ for larger models. Increase batch size and LR proportionally for better GPUs or with AMP enabled. These params were for 2 1080Ti cards:
@@ -21,7 +21,7 @@ The training of this model started with the same command line as EfficientNet-B2
 ## EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5
 [Michael Klachko](https://github.com/michaelklachko) achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2.
 
-`./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048`
+`./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-path 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048`
 
 ## ResNet50 with JSD loss and RandAugment (clean + 2x RA augs) - 79.04 top-1, 94.39 top-5
 
@@ -32,11 +32,11 @@ Trained on two older 1080Ti cards, this took a while. Only slightly, non statist
 ## EfficientNet-ES (EdgeTPU-Small) with RandAugment - 78.066 top-1, 93.926 top-5
 Trained by [Andrew Lavin](https://github.com/andravin) with 8 V100 cards. Model EMA was not used; the final checkpoint is the average of the 8 best checkpoints during training.
 
-`./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064`
+`./distributed_train.sh 8 /imagenet --model efficientnet_es -b 128 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-path 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064`
 
 ## MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5
 
-`./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9`
+`./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-path 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9`
 
 ## ResNeXt-50 32x4d w/ RandAugment - 79.762 top-1, 94.60 top-5
 
diff --git a/tests/test_models.py b/tests/test_models.py
index 4f80612f..035db19e 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -427,6 +427,7 @@ if 'GITHUB_ACTIONS' not in os.environ:
         'deit_*_distilled_patch16_224',
         'levit*',
         'pit_*_distilled_224',
+        '*evob', '*evos',  # until norm_norm_norm branch is merged
     ] + EXCLUDE_FX_FILTERS
 
 
diff --git a/timm/models/efficientnet_builder.py b/timm/models/efficientnet_builder.py
index a23e8273..7a05c890 100644
--- a/timm/models/efficientnet_builder.py
+++ b/timm/models/efficientnet_builder.py
@@ -40,7 +40,7 @@ def get_bn_args_tf():
 
 
 def resolve_bn_args(kwargs):
-    bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {}
+    bn_args = {}
     bn_momentum = kwargs.pop('bn_momentum', None)
     if bn_momentum is not None:
         bn_args['momentum'] = bn_momentum
diff --git a/timm/models/factory.py b/timm/models/factory.py
index d040a9ff..6d3fd982 100644
--- a/timm/models/factory.py
+++ b/timm/models/factory.py
@@ -47,13 +47,6 @@ def create_model(
     """
     source_name, model_name = split_model_name(model_name)
 
-    # Only EfficientNet and MobileNetV3 models have support for batchnorm params or drop_connect_rate passed as args
-    is_efficientnet = is_model_in_modules(model_name, ['efficientnet', 'mobilenetv3'])
-    if not is_efficientnet:
-        kwargs.pop('bn_tf', None)
-        kwargs.pop('bn_momentum', None)
-        kwargs.pop('bn_eps', None)
-
     # handle backwards compat with drop_connect -> drop_path change
     drop_connect_rate = kwargs.pop('drop_connect_rate', None)
     if drop_connect_rate is not None and kwargs.get('drop_path_rate', None) is None:
diff --git a/train.py b/train.py
index 10d839be..6e3b058b 100755
--- a/train.py
+++ b/train.py
@@ -234,8 +234,6 @@ parser.add_argument('--drop-block', type=float, default=None, metavar='PCT',
                     help='Drop block rate (default: None)')
 
 # Batch norm parameters (only works with gen_efficientnet based models currently)
-parser.add_argument('--bn-tf', action='store_true', default=False,
-                    help='Use Tensorflow BatchNorm defaults for models that support it (default: False)')
 parser.add_argument('--bn-momentum', type=float, default=None,
                     help='BatchNorm momentum override (if not None)')
 parser.add_argument('--bn-eps', type=float, default=None,
@@ -375,7 +373,6 @@ def main():
         drop_path_rate=args.drop_path,
         drop_block_rate=args.drop_block,
         global_pool=args.gp,
-        bn_tf=args.bn_tf,
         bn_momentum=args.bn_momentum,
         bn_eps=args.bn_eps,
         scriptable=args.torchscript,
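
Note for anyone who was training with the removed --bn-tf flag: the same TF-style BatchNorm behavior can still be had by passing the overrides explicitly, since resolve_bn_args() keeps honoring bn_momentum / bn_eps for EfficientNet / MobileNetV3 family models, and create_model() still maps the deprecated drop_connect_rate onto drop_path_rate. A minimal caller-side sketch, not taken from this patch; the override values are assumed to match what get_bn_args_tf() returns in efficientnet_builder.py:

    import timm

    # bn_momentum / bn_eps reproduce the TF BatchNorm defaults the removed
    # --bn-tf flag used to apply; drop_path_rate is the preferred spelling
    # of the deprecated drop_connect_rate kwarg.
    model = timm.create_model(
        'efficientnet_b2',
        drop_rate=0.3,
        drop_path_rate=0.2,
        bn_momentum=1 - 0.99,  # assumed TF default momentum (PyTorch convention)
        bn_eps=1e-3,           # assumed TF default eps
    )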