From 532e3b417dba7f2b6106d8be2db647f2e4542b58 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 7 Sep 2020 13:56:00 -0700 Subject: [PATCH] Reorg of utils into separate modules --- timm/utils.py | 400 --------------------------------- timm/utils/__init__.py | 10 + timm/utils/checkpoint_saver.py | 153 +++++++++++++ timm/utils/cuda.py | 47 ++++ timm/utils/distributed.py | 28 +++ timm/utils/jit.py | 18 ++ timm/utils/log.py | 28 +++ timm/utils/metrics.py | 32 +++ timm/utils/misc.py | 18 ++ timm/utils/model.py | 16 ++ timm/utils/model_ema.py | 77 +++++++ timm/utils/summary.py | 34 +++ train.py | 6 +- 13 files changed, 466 insertions(+), 401 deletions(-) delete mode 100644 timm/utils.py create mode 100644 timm/utils/__init__.py create mode 100644 timm/utils/checkpoint_saver.py create mode 100644 timm/utils/cuda.py create mode 100644 timm/utils/distributed.py create mode 100644 timm/utils/jit.py create mode 100644 timm/utils/log.py create mode 100644 timm/utils/metrics.py create mode 100644 timm/utils/misc.py create mode 100644 timm/utils/model.py create mode 100644 timm/utils/model_ema.py create mode 100644 timm/utils/summary.py diff --git a/timm/utils.py b/timm/utils.py deleted file mode 100644 index 94f85d84..00000000 --- a/timm/utils.py +++ /dev/null @@ -1,400 +0,0 @@ -""" Common training and validation utilities - -Hacked together by / Copyright 2020 Ross Wightman -""" - -from copy import deepcopy - -import torch -import math -import os -import re -import shutil -import glob -import csv -import operator -import logging -import logging.handlers -import numpy as np -from collections import OrderedDict -try: - from apex import amp - has_apex = True -except ImportError: - amp = None - has_apex = False - -from torch import distributed as dist - - -_logger = logging.getLogger(__name__) - - -def unwrap_model(model): - if isinstance(model, ModelEma): - return unwrap_model(model.ema) - else: - return model.module if hasattr(model, 'module') else model - - -def get_state_dict(model, unwrap_fn=unwrap_model): - return unwrap_fn(model).state_dict() - - -class ApexScaler: - state_dict_key = "amp" - - def __call__(self, loss, optimizer): - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - def state_dict(self): - if 'state_dict' in amp.__dict__: - return amp.state_dict() - - def load_state_dict(self, state_dict): - if 'load_state_dict' in amp.__dict__: - amp.load_state_dict(state_dict) - - -class NativeScaler: - state_dict_key = "amp_scaler" - - def __init__(self): - self._scaler = torch.cuda.amp.GradScaler() - - def __call__(self, loss, optimizer): - self._scaler.scale(loss).backward() - self._scaler.step(optimizer) - self._scaler.update() - - def state_dict(self): - return self._scaler.state_dict() - - def load_state_dict(self, state_dict): - self._scaler.load_state_dict(state_dict) - - -class CheckpointSaver: - def __init__( - self, - model, - optimizer, - args=None, - model_ema=None, - amp_scaler=None, - checkpoint_prefix='checkpoint', - recovery_prefix='recovery', - checkpoint_dir='', - recovery_dir='', - decreasing=False, - max_history=10, - unwrap_fn=unwrap_model): - - # objects to save state_dicts of - self.model = model - self.optimizer = optimizer - self.args = args - self.model_ema = model_ema - self.amp_scaler = amp_scaler - - # state - self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness - self.best_epoch = None - self.best_metric = None - self.curr_recovery_file = '' - self.last_recovery_file = '' - - # 
config - self.checkpoint_dir = checkpoint_dir - self.recovery_dir = recovery_dir - self.save_prefix = checkpoint_prefix - self.recovery_prefix = recovery_prefix - self.extension = '.pth.tar' - self.decreasing = decreasing # a lower metric is better if True - self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs - self.max_history = max_history - self.unwrap_fn = unwrap_fn - assert self.max_history >= 1 - - def save_checkpoint(self, epoch, metric=None): - assert epoch >= 0 - tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) - last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) - self._save(tmp_save_path, epoch, metric) - if os.path.exists(last_save_path): - os.unlink(last_save_path) # required for Windows support. - os.rename(tmp_save_path, last_save_path) - worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None - if (len(self.checkpoint_files) < self.max_history - or metric is None or self.cmp(metric, worst_file[1])): - if len(self.checkpoint_files) >= self.max_history: - self._cleanup_checkpoints(1) - filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension - save_path = os.path.join(self.checkpoint_dir, filename) - os.link(last_save_path, save_path) - self.checkpoint_files.append((save_path, metric)) - self.checkpoint_files = sorted( - self.checkpoint_files, key=lambda x: x[1], - reverse=not self.decreasing) # sort in descending order if a lower metric is not better - - checkpoints_str = "Current checkpoints:\n" - for c in self.checkpoint_files: - checkpoints_str += ' {}\n'.format(c) - _logger.info(checkpoints_str) - - if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): - self.best_epoch = epoch - self.best_metric = metric - best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) - if os.path.exists(best_save_path): - os.unlink(best_save_path) - os.link(last_save_path, best_save_path) - - return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) - - def _save(self, save_path, epoch, metric=None): - save_state = { - 'epoch': epoch, - 'arch': type(self.model).__name__.lower(), - 'state_dict': get_state_dict(self.model, self.unwrap_fn), - 'optimizer': self.optimizer.state_dict(), - 'version': 2, # version < 2 increments epoch before save - } - if self.args is not None: - save_state['arch'] = self.args.model - save_state['args'] = self.args - if self.amp_scaler is not None: - save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() - if self.model_ema is not None: - save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) - if metric is not None: - save_state['metric'] = metric - torch.save(save_state, save_path) - - def _cleanup_checkpoints(self, trim=0): - trim = min(len(self.checkpoint_files), trim) - delete_index = self.max_history - trim - if delete_index <= 0 or len(self.checkpoint_files) <= delete_index: - return - to_delete = self.checkpoint_files[delete_index:] - for d in to_delete: - try: - _logger.debug("Cleaning checkpoint: {}".format(d)) - os.remove(d[0]) - except Exception as e: - _logger.error("Exception '{}' while deleting checkpoint".format(e)) - self.checkpoint_files = self.checkpoint_files[:delete_index] - - def save_recovery(self, epoch, batch_idx=0): - assert epoch >= 0 - filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension - save_path = os.path.join(self.recovery_dir, filename) - 
self._save(save_path, epoch) - if os.path.exists(self.last_recovery_file): - try: - _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) - os.remove(self.last_recovery_file) - except Exception as e: - _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) - self.last_recovery_file = self.curr_recovery_file - self.curr_recovery_file = save_path - - def find_recovery(self): - recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) - files = glob.glob(recovery_path + '*' + self.extension) - files = sorted(files) - if len(files): - return files[0] - else: - return '' - - -class AverageMeter: - """Computes and stores the average and current value""" - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - maxk = max(topk) - batch_size = target.size(0) - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - return [correct[:k].view(-1).float().sum(0) * 100. / batch_size for k in topk] - - -def get_outdir(path, *paths, inc=False): - outdir = os.path.join(path, *paths) - if not os.path.exists(outdir): - os.makedirs(outdir) - elif inc: - count = 1 - outdir_inc = outdir + '-' + str(count) - while os.path.exists(outdir_inc): - count = count + 1 - outdir_inc = outdir + '-' + str(count) - assert count < 100 - outdir = outdir_inc - os.makedirs(outdir) - return outdir - - -def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False): - rowd = OrderedDict(epoch=epoch) - rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) - rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) - with open(filename, mode='a') as cf: - dw = csv.DictWriter(cf, fieldnames=rowd.keys()) - if write_header: # first iteration (epoch == 1 can't be used) - dw.writeheader() - dw.writerow(rowd) - - -def natural_key(string_): - """See http://www.codinghorror.com/blog/archives/001018.html""" - return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] - - -def reduce_tensor(tensor, n): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= n - return rt - - -def distribute_bn(model, world_size, reduce=False): - # ensure every node has the same running bn stats - for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): - if ('running_mean' in bn_name) or ('running_var' in bn_name): - if reduce: - # average bn stats across whole group - torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) - bn_buf /= float(world_size) - else: - # broadcast bn stats from rank 0 to whole group - torch.distributed.broadcast(bn_buf, 0) - - -class ModelEma: - """ Model Exponential Moving Average - Keep a moving average of everything in the model state_dict (parameters and buffers). - - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - - A smoothed version of the weights is necessary for some training schemes to perform well. - E.g. 
Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use - RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA - smoothing of weights to match results. Pay attention to the decay constant you are using - relative to your update count per epoch. - - To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but - disable validation of the EMA weights. Validation will have to be done manually in a separate - process, or after the training stops converging. - - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. - I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU. - """ - def __init__(self, model, decay=0.9999, device='', resume=''): - # make a copy of the model for accumulating moving average of weights - self.ema = deepcopy(model) - self.ema.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if device: - self.ema.to(device=device) - self.ema_has_module = hasattr(self.ema, 'module') - if resume: - self._load_checkpoint(resume) - for p in self.ema.parameters(): - p.requires_grad_(False) - - def _load_checkpoint(self, checkpoint_path): - checkpoint = torch.load(checkpoint_path, map_location='cpu') - assert isinstance(checkpoint, dict) - if 'state_dict_ema' in checkpoint: - new_state_dict = OrderedDict() - for k, v in checkpoint['state_dict_ema'].items(): - # ema model may have been wrapped by DataParallel, and need module prefix - if self.ema_has_module: - name = 'module.' + k if not k.startswith('module') else k - else: - name = k - new_state_dict[name] = v - self.ema.load_state_dict(new_state_dict) - _logger.info("Loaded state_dict_ema") - else: - _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") - - def update(self, model): - # correct a mismatch in state dict keys - needs_module = hasattr(model, 'module') and not self.ema_has_module - with torch.no_grad(): - msd = model.state_dict() - for k, ema_v in self.ema.state_dict().items(): - if needs_module: - k = 'module.' + k - model_v = msd[k].detach() - if self.device: - model_v = model_v.to(device=self.device) - ema_v.copy_(ema_v * self.decay + (1. 
- self.decay) * model_v) - - -class FormatterNoInfo(logging.Formatter): - def __init__(self, fmt='%(levelname)s: %(message)s'): - logging.Formatter.__init__(self, fmt) - - def format(self, record): - if record.levelno == logging.INFO: - return str(record.getMessage()) - return logging.Formatter.format(self, record) - - -def setup_default_logging(default_level=logging.INFO, log_path=''): - console_handler = logging.StreamHandler() - console_handler.setFormatter(FormatterNoInfo()) - logging.root.addHandler(console_handler) - logging.root.setLevel(default_level) - if log_path: - file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) - file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") - file_handler.setFormatter(file_formatter) - logging.root.addHandler(file_handler) - - -def add_bool_arg(parser, name, default=False, help=''): - dest_name = name.replace('-', '_') - group = parser.add_mutually_exclusive_group(required=False) - group.add_argument('--' + name, dest=dest_name, action='store_true', help=help) - group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help) - parser.set_defaults(**{dest_name: default}) - - -def set_jit_legacy(): - """ Set JIT executor to legacy w/ support for op fusion - This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes - in the JIT exectutor. These API are not supported so could change. - """ - # - assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" - torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - torch._C._jit_override_can_fuse_on_gpu(True) - #torch._C._jit_set_texpr_fuser_enabled(True) diff --git a/timm/utils/__init__.py b/timm/utils/__init__.py new file mode 100644 index 00000000..6efc2115 --- /dev/null +++ b/timm/utils/__init__.py @@ -0,0 +1,10 @@ +from .checkpoint_saver import CheckpointSaver +from .cuda import ApexScaler, NativeScaler +from .distributed import distribute_bn, reduce_tensor +from .jit import set_jit_legacy +from .log import setup_default_logging, FormatterNoInfo +from .metrics import AverageMeter, accuracy +from .misc import natural_key, add_bool_arg +from .model import unwrap_model, get_state_dict +from .model_ema import ModelEma +from .summary import update_summary, get_outdir diff --git a/timm/utils/checkpoint_saver.py b/timm/utils/checkpoint_saver.py new file mode 100644 index 00000000..51896e78 --- /dev/null +++ b/timm/utils/checkpoint_saver.py @@ -0,0 +1,153 @@ +""" Checkpoint Saver + +Track top-n training checkpoints and maintain recovery checkpoints on specified intervals. 
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import glob +import operator +import os +import logging + +import torch + +from .model import unwrap_model, get_state_dict + + +_logger = logging.getLogger(__name__) + + +class CheckpointSaver: + def __init__( + self, + model, + optimizer, + args=None, + model_ema=None, + amp_scaler=None, + checkpoint_prefix='checkpoint', + recovery_prefix='recovery', + checkpoint_dir='', + recovery_dir='', + decreasing=False, + max_history=10, + unwrap_fn=unwrap_model): + + # objects to save state_dicts of + self.model = model + self.optimizer = optimizer + self.args = args + self.model_ema = model_ema + self.amp_scaler = amp_scaler + + # state + self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness + self.best_epoch = None + self.best_metric = None + self.curr_recovery_file = '' + self.last_recovery_file = '' + + # config + self.checkpoint_dir = checkpoint_dir + self.recovery_dir = recovery_dir + self.save_prefix = checkpoint_prefix + self.recovery_prefix = recovery_prefix + self.extension = '.pth.tar' + self.decreasing = decreasing # a lower metric is better if True + self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs + self.max_history = max_history + self.unwrap_fn = unwrap_fn + assert self.max_history >= 1 + + def save_checkpoint(self, epoch, metric=None): + assert epoch >= 0 + tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) + last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) + self._save(tmp_save_path, epoch, metric) + if os.path.exists(last_save_path): + os.unlink(last_save_path) # required for Windows support. + os.rename(tmp_save_path, last_save_path) + worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None + if (len(self.checkpoint_files) < self.max_history + or metric is None or self.cmp(metric, worst_file[1])): + if len(self.checkpoint_files) >= self.max_history: + self._cleanup_checkpoints(1) + filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension + save_path = os.path.join(self.checkpoint_dir, filename) + os.link(last_save_path, save_path) + self.checkpoint_files.append((save_path, metric)) + self.checkpoint_files = sorted( + self.checkpoint_files, key=lambda x: x[1], + reverse=not self.decreasing) # sort in descending order if a lower metric is not better + + checkpoints_str = "Current checkpoints:\n" + for c in self.checkpoint_files: + checkpoints_str += ' {}\n'.format(c) + _logger.info(checkpoints_str) + + if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): + self.best_epoch = epoch + self.best_metric = metric + best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) + if os.path.exists(best_save_path): + os.unlink(best_save_path) + os.link(last_save_path, best_save_path) + + return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) + + def _save(self, save_path, epoch, metric=None): + save_state = { + 'epoch': epoch, + 'arch': type(self.model).__name__.lower(), + 'state_dict': get_state_dict(self.model, self.unwrap_fn), + 'optimizer': self.optimizer.state_dict(), + 'version': 2, # version < 2 increments epoch before save + } + if self.args is not None: + save_state['arch'] = self.args.model + save_state['args'] = self.args + if self.amp_scaler is not None: + save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() + if self.model_ema is not None: + 
save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) + if metric is not None: + save_state['metric'] = metric + torch.save(save_state, save_path) + + def _cleanup_checkpoints(self, trim=0): + trim = min(len(self.checkpoint_files), trim) + delete_index = self.max_history - trim + if delete_index <= 0 or len(self.checkpoint_files) <= delete_index: + return + to_delete = self.checkpoint_files[delete_index:] + for d in to_delete: + try: + _logger.debug("Cleaning checkpoint: {}".format(d)) + os.remove(d[0]) + except Exception as e: + _logger.error("Exception '{}' while deleting checkpoint".format(e)) + self.checkpoint_files = self.checkpoint_files[:delete_index] + + def save_recovery(self, epoch, batch_idx=0): + assert epoch >= 0 + filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension + save_path = os.path.join(self.recovery_dir, filename) + self._save(save_path, epoch) + if os.path.exists(self.last_recovery_file): + try: + _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) + os.remove(self.last_recovery_file) + except Exception as e: + _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) + self.last_recovery_file = self.curr_recovery_file + self.curr_recovery_file = save_path + + def find_recovery(self): + recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) + files = glob.glob(recovery_path + '*' + self.extension) + files = sorted(files) + if len(files): + return files[0] + else: + return '' diff --git a/timm/utils/cuda.py b/timm/utils/cuda.py new file mode 100644 index 00000000..695f40b1 --- /dev/null +++ b/timm/utils/cuda.py @@ -0,0 +1,47 @@ +""" CUDA / AMP utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + +try: + from apex import amp + has_apex = True +except ImportError: + amp = None + has_apex = False + + +class ApexScaler: + state_dict_key = "amp" + + def __call__(self, loss, optimizer): + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer.step() + + def state_dict(self): + if 'state_dict' in amp.__dict__: + return amp.state_dict() + + def load_state_dict(self, state_dict): + if 'load_state_dict' in amp.__dict__: + amp.load_state_dict(state_dict) + + +class NativeScaler: + state_dict_key = "amp_scaler" + + def __init__(self): + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer): + self._scaler.scale(loss).backward() + self._scaler.step(optimizer) + self._scaler.update() + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) diff --git a/timm/utils/distributed.py b/timm/utils/distributed.py new file mode 100644 index 00000000..3c5dba8c --- /dev/null +++ b/timm/utils/distributed.py @@ -0,0 +1,28 @@ +""" Distributed training/validation utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import distributed as dist + +from .model import unwrap_model + + +def reduce_tensor(tensor, n): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= n + return rt + + +def distribute_bn(model, world_size, reduce=False): + # ensure every node has the same running bn stats + for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): + if ('running_mean' in bn_name) or ('running_var' in bn_name): + if reduce: + # average bn stats across whole group + torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) + bn_buf /= 
float(world_size) + else: + # broadcast bn stats from rank 0 to whole group + torch.distributed.broadcast(bn_buf, 0) diff --git a/timm/utils/jit.py b/timm/utils/jit.py new file mode 100644 index 00000000..185ab7a0 --- /dev/null +++ b/timm/utils/jit.py @@ -0,0 +1,18 @@ +""" JIT scripting/tracing utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + + +def set_jit_legacy(): + """ Set JIT executor to legacy w/ support for op fusion + This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes + in the JIT exectutor. These API are not supported so could change. + """ + # + assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" + torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._jit_override_can_fuse_on_gpu(True) + #torch._C._jit_set_texpr_fuser_enabled(True) diff --git a/timm/utils/log.py b/timm/utils/log.py new file mode 100644 index 00000000..c99469e0 --- /dev/null +++ b/timm/utils/log.py @@ -0,0 +1,28 @@ +""" Logging helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +import logging.handlers + + +class FormatterNoInfo(logging.Formatter): + def __init__(self, fmt='%(levelname)s: %(message)s'): + logging.Formatter.__init__(self, fmt) + + def format(self, record): + if record.levelno == logging.INFO: + return str(record.getMessage()) + return logging.Formatter.format(self, record) + + +def setup_default_logging(default_level=logging.INFO, log_path=''): + console_handler = logging.StreamHandler() + console_handler.setFormatter(FormatterNoInfo()) + logging.root.addHandler(console_handler) + logging.root.setLevel(default_level) + if log_path: + file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) + file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") + file_handler.setFormatter(file_formatter) + logging.root.addHandler(file_handler) diff --git a/timm/utils/metrics.py b/timm/utils/metrics.py new file mode 100644 index 00000000..08f9a9b3 --- /dev/null +++ b/timm/utils/metrics.py @@ -0,0 +1,32 @@ +""" Eval metrics and related + +Hacked together by / Copyright 2020 Ross Wightman +""" + + +class AverageMeter: + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [correct[:k].view(-1).float().sum(0) * 100. 
/ batch_size for k in topk] diff --git a/timm/utils/misc.py b/timm/utils/misc.py new file mode 100644 index 00000000..39c0097c --- /dev/null +++ b/timm/utils/misc.py @@ -0,0 +1,18 @@ +""" Misc utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import re + + +def natural_key(string_): + """See http://www.codinghorror.com/blog/archives/001018.html""" + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def add_bool_arg(parser, name, default=False, help=''): + dest_name = name.replace('-', '_') + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--' + name, dest=dest_name, action='store_true', help=help) + group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help) + parser.set_defaults(**{dest_name: default}) diff --git a/timm/utils/model.py b/timm/utils/model.py new file mode 100644 index 00000000..cfd42806 --- /dev/null +++ b/timm/utils/model.py @@ -0,0 +1,16 @@ +""" Model / state_dict utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +from .model_ema import ModelEma + + +def unwrap_model(model): + if isinstance(model, ModelEma): + return unwrap_model(model.ema) + else: + return model.module if hasattr(model, 'module') else model + + +def get_state_dict(model, unwrap_fn=unwrap_model): + return unwrap_fn(model).state_dict() diff --git a/timm/utils/model_ema.py b/timm/utils/model_ema.py new file mode 100644 index 00000000..b788b32e --- /dev/null +++ b/timm/utils/model_ema.py @@ -0,0 +1,77 @@ +""" Exponential Moving Average (EMA) of model updates + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +from collections import OrderedDict +from copy import deepcopy + +import torch + +_logger = logging.getLogger(__name__) + + +class ModelEma: + """ Model Exponential Moving Average + Keep a moving average of everything in the model state_dict (parameters and buffers). + + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + + A smoothed version of the weights is necessary for some training schemes to perform well. + E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use + RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA + smoothing of weights to match results. Pay attention to the decay constant you are using + relative to your update count per epoch. + + To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but + disable validation of the EMA weights. Validation will have to be done manually in a separate + process, or after the training stops converging. + + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU. 
+ """ + def __init__(self, model, decay=0.9999, device='', resume=''): + # make a copy of the model for accumulating moving average of weights + self.ema = deepcopy(model) + self.ema.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if device: + self.ema.to(device=device) + self.ema_has_module = hasattr(self.ema, 'module') + if resume: + self._load_checkpoint(resume) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def _load_checkpoint(self, checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + assert isinstance(checkpoint, dict) + if 'state_dict_ema' in checkpoint: + new_state_dict = OrderedDict() + for k, v in checkpoint['state_dict_ema'].items(): + # ema model may have been wrapped by DataParallel, and need module prefix + if self.ema_has_module: + name = 'module.' + k if not k.startswith('module') else k + else: + name = k + new_state_dict[name] = v + self.ema.load_state_dict(new_state_dict) + _logger.info("Loaded state_dict_ema") + else: + _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") + + def update(self, model): + # correct a mismatch in state dict keys + needs_module = hasattr(model, 'module') and not self.ema_has_module + with torch.no_grad(): + msd = model.state_dict() + for k, ema_v in self.ema.state_dict().items(): + if needs_module: + k = 'module.' + k + model_v = msd[k].detach() + if self.device: + model_v = model_v.to(device=self.device) + ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v) diff --git a/timm/utils/summary.py b/timm/utils/summary.py new file mode 100644 index 00000000..a0801eaa --- /dev/null +++ b/timm/utils/summary.py @@ -0,0 +1,34 @@ +""" Summary utilities + +Hacked together by / Copyright 2020 Ross Wightman +""" +import csv +import os +from collections import OrderedDict + + +def get_outdir(path, *paths, inc=False): + outdir = os.path.join(path, *paths) + if not os.path.exists(outdir): + os.makedirs(outdir) + elif inc: + count = 1 + outdir_inc = outdir + '-' + str(count) + while os.path.exists(outdir_inc): + count = count + 1 + outdir_inc = outdir + '-' + str(count) + assert count < 100 + outdir = outdir_inc + os.makedirs(outdir) + return outdir + + +def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False): + rowd = OrderedDict(epoch=epoch) + rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) + rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) + with open(filename, mode='a') as cf: + dw = csv.DictWriter(cf, fieldnames=rowd.keys()) + if write_header: # first iteration (epoch == 1 can't be used) + dw.writeheader() + dw.writerow(rowd) diff --git a/train.py b/train.py index 260de18b..023a51ca 100755 --- a/train.py +++ b/train.py @@ -17,9 +17,13 @@ Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman) import argparse import time import yaml -from datetime import datetime +import os +import logging +from collections import OrderedDict from contextlib import suppress +from datetime import datetime +import torch import torch.nn as nn import torchvision.utils from torch.nn.parallel import DistributedDataParallel as NativeDDP