Merge pull request #237 from rwightman/utils_cleanup

Utils refactor, more cutmix modes / change mode arg
5 years ago · e39bf6ef59
parent 9ce42d5c5a 47a7b3b5b1
commit e39bf6ef59
14 changed files with 545 additions and 417 deletions
--- a/timm/data/mixup.py
+++ b/timm/data/mixup.py
@ -96,13 +96,13 @@ class Mixup:
        cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None.
        prob (float): probability of applying mixup or cutmix per batch or element
        switch_prob (float): probability of switching to cutmix instead of mixup when both are active
-        elementwise (bool): apply mixup/cutmix params per batch element instead of per batch
+        mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element)
        correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders
        label_smoothing (float): apply label smoothing to the mixed target tensor
        num_classes (int): number of classes for target
    """
    def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5,
-                 elementwise=False, correct_lam=True, label_smoothing=0.1, num_classes=1000):
+                 mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000):
        self.mixup_alpha = mixup_alpha
        self.cutmix_alpha = cutmix_alpha
        self.cutmix_minmax = cutmix_minmax
@ -114,7 +114,7 @@ class Mixup:
        self.switch_prob = switch_prob
        self.label_smoothing = label_smoothing
        self.num_classes = num_classes
-        self.elementwise = elementwise
+        self.mode = mode
        self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
        self.mixup_enabled = True  # set to false to disable mixing (intended tp be set by train loop)

@ -173,6 +173,26 @@ class Mixup:
                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)

+    def _mix_pair(self, x):
+        batch_size = len(x)
+        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
+        x_orig = x.clone()  # need to keep an unmodified original for mixing source
+        for i in range(batch_size // 2):
+            j = batch_size - i - 1
+            lam = lam_batch[i]
+            if lam != 1.:
+                if use_cutmix[i]:
+                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
+                        x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
+                    x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh]
+                    x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh]
+                    lam_batch[i] = lam
+                else:
+                    x[i] = x[i] * lam + x_orig[j] * (1 - lam)
+                    x[j] = x[j] * lam + x_orig[i] * (1 - lam)
+        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
+        return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1)
+
    def _mix_batch(self, x):
        lam, use_cutmix = self._params_per_batch()
        if lam == 1.:
@ -188,7 +208,12 @@ class Mixup:

    def __call__(self, x, target):
        assert len(x) % 2 == 0, 'Batch size should be even when using this'
-        lam = self._mix_elem(x) if self.elementwise else self._mix_batch(x)
+        if self.mode == 'elem':
+            lam = self._mix_elem(x)
+        elif self.mode == 'pair':
+            lam = self._mix_pair(x)
+        else:
+            lam = self._mix_batch(x)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing)
        return x, target

@ -199,25 +224,57 @@ class FastCollateMixup(Mixup):
    A Mixup impl that's performed while collating the batches.
    """

-    def _mix_elem_collate(self, output, batch):
+    def _mix_elem_collate(self, output, batch, half=False):
        batch_size = len(batch)
-        lam_batch, use_cutmix = self._params_per_elem(batch_size)
-        for i in range(batch_size):
+        num_elem = batch_size // 2 if half else batch_size
+        assert len(output) == num_elem
+        lam_batch, use_cutmix = self._params_per_elem(num_elem)
+        for i in range(num_elem):
            j = batch_size - i - 1
            lam = lam_batch[i]
            mixed = batch[i][0]
            if lam != 1.:
                if use_cutmix[i]:
-                    mixed = mixed.copy()
+                    if not half:
+                        mixed = mixed.copy()
                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                    mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                    lam_batch[i] = lam
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    lam_batch[i] = lam
-                    np.round(mixed, out=mixed)
+                    np.rint(mixed, out=mixed)
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
+        if half:
+            lam_batch = np.concatenate((lam_batch, np.ones(num_elem)))
+        return torch.tensor(lam_batch).unsqueeze(1)
+
+    def _mix_pair_collate(self, output, batch):
+        batch_size = len(batch)
+        lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
+        for i in range(batch_size // 2):
+            j = batch_size - i - 1
+            lam = lam_batch[i]
+            mixed_i = batch[i][0]
+            mixed_j = batch[j][0]
+            assert 0 <= lam <= 1.0
+            if lam < 1.:
+                if use_cutmix[i]:
+                    (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
+                        output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
+                    patch_i = mixed_i[:, yl:yh, xl:xh].copy()
+                    mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh]
+                    mixed_j[:, yl:yh, xl:xh] = patch_i
+                    lam_batch[i] = lam
+                else:
+                    mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
+                    mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
+                    mixed_i = mixed_temp
+                    np.rint(mixed_j, out=mixed_j)
+                    np.rint(mixed_i, out=mixed_i)
+            output[i] += torch.from_numpy(mixed_i.astype(np.uint8))
+            output[j] += torch.from_numpy(mixed_j.astype(np.uint8))
+        lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
        return torch.tensor(lam_batch).unsqueeze(1)

    def _mix_batch_collate(self, output, batch):
@ -235,19 +292,25 @@ class FastCollateMixup(Mixup):
                    mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                else:
                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    np.round(mixed, out=mixed)
+                    np.rint(mixed, out=mixed)
            output[i] += torch.from_numpy(mixed.astype(np.uint8))
        return lam

    def __call__(self, batch, _=None):
        batch_size = len(batch)
        assert batch_size % 2 == 0, 'Batch size should be even when using this'
+        half = 'half' in self.mode
+        if half:
+            batch_size //= 2
        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
-        if self.elementwise:
-            lam = self._mix_elem_collate(output, batch)
+        if self.mode == 'elem' or self.mode == 'half':
+            lam = self._mix_elem_collate(output, batch, half=half)
+        elif self.mode == 'pair':
+            lam = self._mix_pair_collate(output, batch)
        else:
            lam = self._mix_batch_collate(output, batch)
        target = torch.tensor([b[1] for b in batch], dtype=torch.int64)
        target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu')
+        target = target[:batch_size]
        return output, target

--- a/timm/utils.py
+++ b/timm/utils.py
@ -1,400 +0,0 @@
-""" Common training and validation utilities
-
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-
-from copy import deepcopy
-
-import torch
-import math
-import os
-import re
-import shutil
-import glob
-import csv
-import operator
-import logging
-import logging.handlers
-import numpy as np
-from collections import OrderedDict
-try:
-    from apex import amp
-    has_apex = True
-except ImportError:
-    amp = None
-    has_apex = False
-
-from torch import distributed as dist
-
-
-_logger = logging.getLogger(__name__)
-
-
-def unwrap_model(model):
-    if isinstance(model, ModelEma):
-        return unwrap_model(model.ema)
-    else:
-        return model.module if hasattr(model, 'module') else model
-
-
-def get_state_dict(model, unwrap_fn=unwrap_model):
-    return unwrap_fn(model).state_dict()
-
-
-class ApexScaler:
-    state_dict_key = "amp"
-
-    def __call__(self, loss, optimizer):
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
-            scaled_loss.backward()
-        optimizer.step()
-
-    def state_dict(self):
-        if 'state_dict' in amp.__dict__:
-            return amp.state_dict()
-
-    def load_state_dict(self, state_dict):
-        if 'load_state_dict' in amp.__dict__:
-            amp.load_state_dict(state_dict)
-
-
-class NativeScaler:
-    state_dict_key = "amp_scaler"
-
-    def __init__(self):
-        self._scaler = torch.cuda.amp.GradScaler()
-
-    def __call__(self, loss, optimizer):
-        self._scaler.scale(loss).backward()
-        self._scaler.step(optimizer)
-        self._scaler.update()
-
-    def state_dict(self):
-        return self._scaler.state_dict()
-
-    def load_state_dict(self, state_dict):
-        self._scaler.load_state_dict(state_dict)
-
-
-class CheckpointSaver:
-    def __init__(
-            self,
-            model,
-            optimizer,
-            args=None,
-            model_ema=None,
-            amp_scaler=None,
-            checkpoint_prefix='checkpoint',
-            recovery_prefix='recovery',
-            checkpoint_dir='',
-            recovery_dir='',
-            decreasing=False,
-            max_history=10,
-            unwrap_fn=unwrap_model):
-
-        # objects to save state_dicts of
-        self.model = model
-        self.optimizer = optimizer
-        self.args = args
-        self.model_ema = model_ema
-        self.amp_scaler = amp_scaler
-
-        # state
-        self.checkpoint_files = []  # (filename, metric) tuples in order of decreasing betterness
-        self.best_epoch = None
-        self.best_metric = None
-        self.curr_recovery_file = ''
-        self.last_recovery_file = ''
-
-        # config
-        self.checkpoint_dir = checkpoint_dir
-        self.recovery_dir = recovery_dir
-        self.save_prefix = checkpoint_prefix
-        self.recovery_prefix = recovery_prefix
-        self.extension = '.pth.tar'
-        self.decreasing = decreasing  # a lower metric is better if True
-        self.cmp = operator.lt if decreasing else operator.gt  # True if lhs better than rhs
-        self.max_history = max_history
-        self.unwrap_fn = unwrap_fn
-        assert self.max_history >= 1
-
-    def save_checkpoint(self, epoch, metric=None):
-        assert epoch >= 0
-        tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension)
-        last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension)
-        self._save(tmp_save_path, epoch, metric)
-        if os.path.exists(last_save_path):
-            os.unlink(last_save_path) # required for Windows support.
-        os.rename(tmp_save_path, last_save_path)
-        worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None
-        if (len(self.checkpoint_files) < self.max_history
-                or metric is None or self.cmp(metric, worst_file[1])):
-            if len(self.checkpoint_files) >= self.max_history:
-                self._cleanup_checkpoints(1)
-            filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension
-            save_path = os.path.join(self.checkpoint_dir, filename)
-            os.link(last_save_path, save_path)
-            self.checkpoint_files.append((save_path, metric))
-            self.checkpoint_files = sorted(
-                self.checkpoint_files, key=lambda x: x[1],
-                reverse=not self.decreasing)  # sort in descending order if a lower metric is not better
-
-            checkpoints_str = "Current checkpoints:\n"
-            for c in self.checkpoint_files:
-                checkpoints_str += ' {}\n'.format(c)
-            _logger.info(checkpoints_str)
-
-            if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)):
-                self.best_epoch = epoch
-                self.best_metric = metric
-                best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension)
-                if os.path.exists(best_save_path):
-                    os.unlink(best_save_path)
-                os.link(last_save_path, best_save_path)
-
-        return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch)
-
-    def _save(self, save_path, epoch, metric=None):
-        save_state = {
-            'epoch': epoch,
-            'arch': type(self.model).__name__.lower(),
-            'state_dict': get_state_dict(self.model, self.unwrap_fn),
-            'optimizer': self.optimizer.state_dict(),
-            'version': 2,  # version < 2 increments epoch before save
-        }
-        if self.args is not None:
-            save_state['arch'] = self.args.model
-            save_state['args'] = self.args
-        if self.amp_scaler is not None:
-            save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict()
-        if self.model_ema is not None:
-            save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn)
-        if metric is not None:
-            save_state['metric'] = metric
-        torch.save(save_state, save_path)
-
-    def _cleanup_checkpoints(self, trim=0):
-        trim = min(len(self.checkpoint_files), trim)
-        delete_index = self.max_history - trim
-        if delete_index <= 0 or len(self.checkpoint_files) <= delete_index:
-            return
-        to_delete = self.checkpoint_files[delete_index:]
-        for d in to_delete:
-            try:
-                _logger.debug("Cleaning checkpoint: {}".format(d))
-                os.remove(d[0])
-            except Exception as e:
-                _logger.error("Exception '{}' while deleting checkpoint".format(e))
-        self.checkpoint_files = self.checkpoint_files[:delete_index]
-
-    def save_recovery(self, epoch, batch_idx=0):
-        assert epoch >= 0
-        filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension
-        save_path = os.path.join(self.recovery_dir, filename)
-        self._save(save_path, epoch)
-        if os.path.exists(self.last_recovery_file):
-            try:
-                _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file))
-                os.remove(self.last_recovery_file)
-            except Exception as e:
-                _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file))
-        self.last_recovery_file = self.curr_recovery_file
-        self.curr_recovery_file = save_path
-
-    def find_recovery(self):
-        recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix)
-        files = glob.glob(recovery_path + '*' + self.extension)
-        files = sorted(files)
-        if len(files):
-            return files[0]
-        else:
-            return ''
-
-
-class AverageMeter:
-    """Computes and stores the average and current value"""
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-
-    def update(self, val, n=1):
-        self.val = val
-        self.sum += val * n
-        self.count += n
-        self.avg = self.sum / self.count
-
-
-def accuracy(output, target, topk=(1,)):
-    """Computes the accuracy over the k top predictions for the specified values of k"""
-    maxk = max(topk)
-    batch_size = target.size(0)
-    _, pred = output.topk(maxk, 1, True, True)
-    pred = pred.t()
-    correct = pred.eq(target.view(1, -1).expand_as(pred))
-    return [correct[:k].view(-1).float().sum(0) * 100. / batch_size for k in topk]
-
-
-def get_outdir(path, *paths, inc=False):
-    outdir = os.path.join(path, *paths)
-    if not os.path.exists(outdir):
-        os.makedirs(outdir)
-    elif inc:
-        count = 1
-        outdir_inc = outdir + '-' + str(count)
-        while os.path.exists(outdir_inc):
-            count = count + 1
-            outdir_inc = outdir + '-' + str(count)
-            assert count < 100
-        outdir = outdir_inc
-        os.makedirs(outdir)
-    return outdir
-
-
-def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False):
-    rowd = OrderedDict(epoch=epoch)
-    rowd.update([('train_' + k, v) for k, v in train_metrics.items()])
-    rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()])
-    with open(filename, mode='a') as cf:
-        dw = csv.DictWriter(cf, fieldnames=rowd.keys())
-        if write_header:  # first iteration (epoch == 1 can't be used)
-            dw.writeheader()
-        dw.writerow(rowd)
-
-
-def natural_key(string_):
-    """See http://www.codinghorror.com/blog/archives/001018.html"""
-    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
-
-
-def reduce_tensor(tensor, n):
-    rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
-    rt /= n
-    return rt
-
-
-def distribute_bn(model, world_size, reduce=False):
-    # ensure every node has the same running bn stats
-    for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True):
-        if ('running_mean' in bn_name) or ('running_var' in bn_name):
-            if reduce:
-                # average bn stats across whole group
-                torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
-                bn_buf /= float(world_size)
-            else:
-                # broadcast bn stats from rank 0 to whole group
-                torch.distributed.broadcast(bn_buf, 0)
-
-
-class ModelEma:
-    """ Model Exponential Moving Average
-    Keep a moving average of everything in the model state_dict (parameters and buffers).
-
-    This is intended to allow functionality like
-    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
-
-    A smoothed version of the weights is necessary for some training schemes to perform well.
-    E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
-    RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
-    smoothing of weights to match results. Pay attention to the decay constant you are using
-    relative to your update count per epoch.
-
-    To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
-    disable validation of the EMA weights. Validation will have to be done manually in a separate
-    process, or after the training stops converging.
-
-    This class is sensitive where it is initialized in the sequence of model init,
-    GPU assignment and distributed training wrappers.
-    I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
-    """
-    def __init__(self, model, decay=0.9999, device='', resume=''):
-        # make a copy of the model for accumulating moving average of weights
-        self.ema = deepcopy(model)
-        self.ema.eval()
-        self.decay = decay
-        self.device = device  # perform ema on different device from model if set
-        if device:
-            self.ema.to(device=device)
-        self.ema_has_module = hasattr(self.ema, 'module')
-        if resume:
-            self._load_checkpoint(resume)
-        for p in self.ema.parameters():
-            p.requires_grad_(False)
-
-    def _load_checkpoint(self, checkpoint_path):
-        checkpoint = torch.load(checkpoint_path, map_location='cpu')
-        assert isinstance(checkpoint, dict)
-        if 'state_dict_ema' in checkpoint:
-            new_state_dict = OrderedDict()
-            for k, v in checkpoint['state_dict_ema'].items():
-                # ema model may have been wrapped by DataParallel, and need module prefix
-                if self.ema_has_module:
-                    name = 'module.' + k if not k.startswith('module') else k
-                else:
-                    name = k
-                new_state_dict[name] = v
-            self.ema.load_state_dict(new_state_dict)
-            _logger.info("Loaded state_dict_ema")
-        else:
-            _logger.warning("Failed to find state_dict_ema, starting from loaded model weights")
-
-    def update(self, model):
-        # correct a mismatch in state dict keys
-        needs_module = hasattr(model, 'module') and not self.ema_has_module
-        with torch.no_grad():
-            msd = model.state_dict()
-            for k, ema_v in self.ema.state_dict().items():
-                if needs_module:
-                    k = 'module.' + k
-                model_v = msd[k].detach()
-                if self.device:
-                    model_v = model_v.to(device=self.device)
-                ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v)
-
-
-class FormatterNoInfo(logging.Formatter):
-    def __init__(self, fmt='%(levelname)s: %(message)s'):
-        logging.Formatter.__init__(self, fmt)
-
-    def format(self, record):
-        if record.levelno == logging.INFO:
-            return str(record.getMessage())
-        return logging.Formatter.format(self, record)
-
-
-def setup_default_logging(default_level=logging.INFO, log_path=''):
-    console_handler = logging.StreamHandler()
-    console_handler.setFormatter(FormatterNoInfo())
-    logging.root.addHandler(console_handler)
-    logging.root.setLevel(default_level)
-    if log_path:
-        file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3)
-        file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s")
-        file_handler.setFormatter(file_formatter)
-        logging.root.addHandler(file_handler)
-
-
-def add_bool_arg(parser, name, default=False, help=''):
-    dest_name = name.replace('-', '_')
-    group = parser.add_mutually_exclusive_group(required=False)
-    group.add_argument('--' + name, dest=dest_name, action='store_true', help=help)
-    group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help)
-    parser.set_defaults(**{dest_name: default})
-
-
-def set_jit_legacy():
-    """ Set JIT executor to legacy w/ support for op fusion
-    This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes
-    in the JIT exectutor. These API are not supported so could change.
-    """
-    #
-    assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!"
-    torch._C._jit_set_profiling_executor(False)
-    torch._C._jit_set_profiling_mode(False)
-    torch._C._jit_override_can_fuse_on_gpu(True)
-    #torch._C._jit_set_texpr_fuser_enabled(True)
--- a/timm/utils/init.py
+++ b/timm/utils/init.py
@ -0,0 +1,10 @@
+from .checkpoint_saver import CheckpointSaver
+from .cuda import ApexScaler, NativeScaler
+from .distributed import distribute_bn, reduce_tensor
+from .jit import set_jit_legacy
+from .log import setup_default_logging, FormatterNoInfo
+from .metrics import AverageMeter, accuracy
+from .misc import natural_key, add_bool_arg
+from .model import unwrap_model, get_state_dict
+from .model_ema import ModelEma
+from .summary import update_summary, get_outdir
--- a/timm/utils/checkpoint_saver.py
+++ b/timm/utils/checkpoint_saver.py
@ -0,0 +1,153 @@
+""" Checkpoint Saver
+
+Track top-n training checkpoints and maintain recovery checkpoints on specified intervals.
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+
+import glob
+import operator
+import os
+import logging
+
+import torch
+
+from .model import unwrap_model, get_state_dict
+
+
+_logger = logging.getLogger(__name__)
+
+
+class CheckpointSaver:
+    def __init__(
+            self,
+            model,
+            optimizer,
+            args=None,
+            model_ema=None,
+            amp_scaler=None,
+            checkpoint_prefix='checkpoint',
+            recovery_prefix='recovery',
+            checkpoint_dir='',
+            recovery_dir='',
+            decreasing=False,
+            max_history=10,
+            unwrap_fn=unwrap_model):
+
+        # objects to save state_dicts of
+        self.model = model
+        self.optimizer = optimizer
+        self.args = args
+        self.model_ema = model_ema
+        self.amp_scaler = amp_scaler
+
+        # state
+        self.checkpoint_files = []  # (filename, metric) tuples in order of decreasing betterness
+        self.best_epoch = None
+        self.best_metric = None
+        self.curr_recovery_file = ''
+        self.last_recovery_file = ''
+
+        # config
+        self.checkpoint_dir = checkpoint_dir
+        self.recovery_dir = recovery_dir
+        self.save_prefix = checkpoint_prefix
+        self.recovery_prefix = recovery_prefix
+        self.extension = '.pth.tar'
+        self.decreasing = decreasing  # a lower metric is better if True
+        self.cmp = operator.lt if decreasing else operator.gt  # True if lhs better than rhs
+        self.max_history = max_history
+        self.unwrap_fn = unwrap_fn
+        assert self.max_history >= 1
+
+    def save_checkpoint(self, epoch, metric=None):
+        assert epoch >= 0
+        tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension)
+        last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension)
+        self._save(tmp_save_path, epoch, metric)
+        if os.path.exists(last_save_path):
+            os.unlink(last_save_path) # required for Windows support.
+        os.rename(tmp_save_path, last_save_path)
+        worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None
+        if (len(self.checkpoint_files) < self.max_history
+                or metric is None or self.cmp(metric, worst_file[1])):
+            if len(self.checkpoint_files) >= self.max_history:
+                self._cleanup_checkpoints(1)
+            filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension
+            save_path = os.path.join(self.checkpoint_dir, filename)
+            os.link(last_save_path, save_path)
+            self.checkpoint_files.append((save_path, metric))
+            self.checkpoint_files = sorted(
+                self.checkpoint_files, key=lambda x: x[1],
+                reverse=not self.decreasing)  # sort in descending order if a lower metric is not better
+
+            checkpoints_str = "Current checkpoints:\n"
+            for c in self.checkpoint_files:
+                checkpoints_str += ' {}\n'.format(c)
+            _logger.info(checkpoints_str)
+
+            if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)):
+                self.best_epoch = epoch
+                self.best_metric = metric
+                best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension)
+                if os.path.exists(best_save_path):
+                    os.unlink(best_save_path)
+                os.link(last_save_path, best_save_path)
+
+        return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch)
+
+    def _save(self, save_path, epoch, metric=None):
+        save_state = {
+            'epoch': epoch,
+            'arch': type(self.model).__name__.lower(),
+            'state_dict': get_state_dict(self.model, self.unwrap_fn),
+            'optimizer': self.optimizer.state_dict(),
+            'version': 2,  # version < 2 increments epoch before save
+        }
+        if self.args is not None:
+            save_state['arch'] = self.args.model
+            save_state['args'] = self.args
+        if self.amp_scaler is not None:
+            save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict()
+        if self.model_ema is not None:
+            save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn)
+        if metric is not None:
+            save_state['metric'] = metric
+        torch.save(save_state, save_path)
+
+    def _cleanup_checkpoints(self, trim=0):
+        trim = min(len(self.checkpoint_files), trim)
+        delete_index = self.max_history - trim
+        if delete_index <= 0 or len(self.checkpoint_files) <= delete_index:
+            return
+        to_delete = self.checkpoint_files[delete_index:]
+        for d in to_delete:
+            try:
+                _logger.debug("Cleaning checkpoint: {}".format(d))
+                os.remove(d[0])
+            except Exception as e:
+                _logger.error("Exception '{}' while deleting checkpoint".format(e))
+        self.checkpoint_files = self.checkpoint_files[:delete_index]
+
+    def save_recovery(self, epoch, batch_idx=0):
+        assert epoch >= 0
+        filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension
+        save_path = os.path.join(self.recovery_dir, filename)
+        self._save(save_path, epoch)
+        if os.path.exists(self.last_recovery_file):
+            try:
+                _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file))
+                os.remove(self.last_recovery_file)
+            except Exception as e:
+                _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file))
+        self.last_recovery_file = self.curr_recovery_file
+        self.curr_recovery_file = save_path
+
+    def find_recovery(self):
+        recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix)
+        files = glob.glob(recovery_path + '*' + self.extension)
+        files = sorted(files)
+        if len(files):
+            return files[0]
+        else:
+            return ''
--- a/timm/utils/cuda.py
+++ b/timm/utils/cuda.py
@ -0,0 +1,47 @@
+""" CUDA / AMP utils
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch
+
+try:
+    from apex import amp
+    has_apex = True
+except ImportError:
+    amp = None
+    has_apex = False
+
+
+class ApexScaler:
+    state_dict_key = "amp"
+
+    def __call__(self, loss, optimizer):
+        with amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+        optimizer.step()
+
+    def state_dict(self):
+        if 'state_dict' in amp.__dict__:
+            return amp.state_dict()
+
+    def load_state_dict(self, state_dict):
+        if 'load_state_dict' in amp.__dict__:
+            amp.load_state_dict(state_dict)
+
+
+class NativeScaler:
+    state_dict_key = "amp_scaler"
+
+    def __init__(self):
+        self._scaler = torch.cuda.amp.GradScaler()
+
+    def __call__(self, loss, optimizer):
+        self._scaler.scale(loss).backward()
+        self._scaler.step(optimizer)
+        self._scaler.update()
+
+    def state_dict(self):
+        return self._scaler.state_dict()
+
+    def load_state_dict(self, state_dict):
+        self._scaler.load_state_dict(state_dict)
--- a/timm/utils/distributed.py
+++ b/timm/utils/distributed.py
@ -0,0 +1,28 @@
+""" Distributed training/validation utils
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch
+from torch import distributed as dist
+
+from .model import unwrap_model
+
+
+def reduce_tensor(tensor, n):
+    rt = tensor.clone()
+    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
+    rt /= n
+    return rt
+
+
+def distribute_bn(model, world_size, reduce=False):
+    # ensure every node has the same running bn stats
+    for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True):
+        if ('running_mean' in bn_name) or ('running_var' in bn_name):
+            if reduce:
+                # average bn stats across whole group
+                torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM)
+                bn_buf /= float(world_size)
+            else:
+                # broadcast bn stats from rank 0 to whole group
+                torch.distributed.broadcast(bn_buf, 0)
--- a/timm/utils/jit.py
+++ b/timm/utils/jit.py
@ -0,0 +1,18 @@
+""" JIT scripting/tracing utils
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import torch
+
+
+def set_jit_legacy():
+    """ Set JIT executor to legacy w/ support for op fusion
+    This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes
+    in the JIT exectutor. These API are not supported so could change.
+    """
+    #
+    assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!"
+    torch._C._jit_set_profiling_executor(False)
+    torch._C._jit_set_profiling_mode(False)
+    torch._C._jit_override_can_fuse_on_gpu(True)
+    #torch._C._jit_set_texpr_fuser_enabled(True)
--- a/timm/utils/log.py
+++ b/timm/utils/log.py
@ -0,0 +1,28 @@
+""" Logging helpers
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import logging
+import logging.handlers
+
+
+class FormatterNoInfo(logging.Formatter):
+    def __init__(self, fmt='%(levelname)s: %(message)s'):
+        logging.Formatter.__init__(self, fmt)
+
+    def format(self, record):
+        if record.levelno == logging.INFO:
+            return str(record.getMessage())
+        return logging.Formatter.format(self, record)
+
+
+def setup_default_logging(default_level=logging.INFO, log_path=''):
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(FormatterNoInfo())
+    logging.root.addHandler(console_handler)
+    logging.root.setLevel(default_level)
+    if log_path:
+        file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3)
+        file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s")
+        file_handler.setFormatter(file_formatter)
+        logging.root.addHandler(file_handler)
--- a/timm/utils/metrics.py
+++ b/timm/utils/metrics.py
@ -0,0 +1,32 @@
+""" Eval metrics and related
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+
+
+class AverageMeter:
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    maxk = max(topk)
+    batch_size = target.size(0)
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+    return [correct[:k].view(-1).float().sum(0) * 100. / batch_size for k in topk]
--- a/timm/utils/misc.py
+++ b/timm/utils/misc.py
@ -0,0 +1,18 @@
+""" Misc utils
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import re
+
+
+def natural_key(string_):
+    """See http://www.codinghorror.com/blog/archives/001018.html"""
+    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
+
+
+def add_bool_arg(parser, name, default=False, help=''):
+    dest_name = name.replace('-', '_')
+    group = parser.add_mutually_exclusive_group(required=False)
+    group.add_argument('--' + name, dest=dest_name, action='store_true', help=help)
+    group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help)
+    parser.set_defaults(**{dest_name: default})
--- a/timm/utils/model.py
+++ b/timm/utils/model.py
@ -0,0 +1,16 @@
+""" Model / state_dict utils
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+from .model_ema import ModelEma
+
+
+def unwrap_model(model):
+    if isinstance(model, ModelEma):
+        return unwrap_model(model.ema)
+    else:
+        return model.module if hasattr(model, 'module') else model
+
+
+def get_state_dict(model, unwrap_fn=unwrap_model):
+    return unwrap_fn(model).state_dict()
--- a/timm/utils/model_ema.py
+++ b/timm/utils/model_ema.py
@ -0,0 +1,77 @@
+""" Exponential Moving Average (EMA) of model updates
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import logging
+from collections import OrderedDict
+from copy import deepcopy
+
+import torch
+
+_logger = logging.getLogger(__name__)
+
+
+class ModelEma:
+    """ Model Exponential Moving Average
+    Keep a moving average of everything in the model state_dict (parameters and buffers).
+
+    This is intended to allow functionality like
+    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
+
+    A smoothed version of the weights is necessary for some training schemes to perform well.
+    E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use
+    RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA
+    smoothing of weights to match results. Pay attention to the decay constant you are using
+    relative to your update count per epoch.
+
+    To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but
+    disable validation of the EMA weights. Validation will have to be done manually in a separate
+    process, or after the training stops converging.
+
+    This class is sensitive where it is initialized in the sequence of model init,
+    GPU assignment and distributed training wrappers.
+    I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU.
+    """
+    def __init__(self, model, decay=0.9999, device='', resume=''):
+        # make a copy of the model for accumulating moving average of weights
+        self.ema = deepcopy(model)
+        self.ema.eval()
+        self.decay = decay
+        self.device = device  # perform ema on different device from model if set
+        if device:
+            self.ema.to(device=device)
+        self.ema_has_module = hasattr(self.ema, 'module')
+        if resume:
+            self._load_checkpoint(resume)
+        for p in self.ema.parameters():
+            p.requires_grad_(False)
+
+    def _load_checkpoint(self, checkpoint_path):
+        checkpoint = torch.load(checkpoint_path, map_location='cpu')
+        assert isinstance(checkpoint, dict)
+        if 'state_dict_ema' in checkpoint:
+            new_state_dict = OrderedDict()
+            for k, v in checkpoint['state_dict_ema'].items():
+                # ema model may have been wrapped by DataParallel, and need module prefix
+                if self.ema_has_module:
+                    name = 'module.' + k if not k.startswith('module') else k
+                else:
+                    name = k
+                new_state_dict[name] = v
+            self.ema.load_state_dict(new_state_dict)
+            _logger.info("Loaded state_dict_ema")
+        else:
+            _logger.warning("Failed to find state_dict_ema, starting from loaded model weights")
+
+    def update(self, model):
+        # correct a mismatch in state dict keys
+        needs_module = hasattr(model, 'module') and not self.ema_has_module
+        with torch.no_grad():
+            msd = model.state_dict()
+            for k, ema_v in self.ema.state_dict().items():
+                if needs_module:
+                    k = 'module.' + k
+                model_v = msd[k].detach()
+                if self.device:
+                    model_v = model_v.to(device=self.device)
+                ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v)
--- a/timm/utils/summary.py
+++ b/timm/utils/summary.py
@ -0,0 +1,34 @@
+""" Summary utilities
+
+Hacked together by / Copyright 2020 Ross Wightman
+"""
+import csv
+import os
+from collections import OrderedDict
+
+
+def get_outdir(path, *paths, inc=False):
+    outdir = os.path.join(path, *paths)
+    if not os.path.exists(outdir):
+        os.makedirs(outdir)
+    elif inc:
+        count = 1
+        outdir_inc = outdir + '-' + str(count)
+        while os.path.exists(outdir_inc):
+            count = count + 1
+            outdir_inc = outdir + '-' + str(count)
+            assert count < 100
+        outdir = outdir_inc
+        os.makedirs(outdir)
+    return outdir
+
+
+def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False):
+    rowd = OrderedDict(epoch=epoch)
+    rowd.update([('train_' + k, v) for k, v in train_metrics.items()])
+    rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()])
+    with open(filename, mode='a') as cf:
+        dw = csv.DictWriter(cf, fieldnames=rowd.keys())
+        if write_header:  # first iteration (epoch == 1 can't be used)
+            dw.writeheader()
+        dw.writerow(rowd)
--- a/train.py
+++ b/train.py
@ -17,9 +17,13 @@ Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman)
 import argparse
 import time
 import yaml
-from datetime import datetime
+import os
+import logging
+from collections import OrderedDict
 from contextlib import suppress
+from datetime import datetime

+import torch
 import torch.nn as nn
 import torchvision.utils
 from torch.nn.parallel import DistributedDataParallel as NativeDDP
@ -172,8 +176,8 @@ parser.add_argument('--mixup-prob', type=float, default=1.0,
                    help='Probability of performing mixup or cutmix when either/both is enabled')
 parser.add_argument('--mixup-switch-prob', type=float, default=0.5,
                    help='Probability of switching to cutmix when both mixup and cutmix enabled')
-parser.add_argument('--mixup-elem', action='store_true', default=False,
-                    help='Apply mixup/cutmix params uniquely per batch element instead of per batch.')
+parser.add_argument('--mixup-mode', type=str, default='batch',
+                    help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"')
 parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N',
                    help='Turn off mixup after this epoch, disabled if 0 (default: 0)')
 parser.add_argument('--smoothing', type=float, default=0.1,
@ -440,7 +444,7 @@ def main():
    if mixup_active:
        mixup_args = dict(
            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
-            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, elementwise=args.mixup_elem,
+            prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
            label_smoothing=args.smoothing, num_classes=args.num_classes)
        if args.prefetcher:
            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)