From 532e3b417dba7f2b6106d8be2db647f2e4542b58 Mon Sep 17 00:00:00 2001 From: Ross Wightman Date: Mon, 7 Sep 2020 13:56:00 -0700 Subject: [PATCH] Reorg of utils into separate modules --- timm/utils.py | 400 --------------------------------- timm/utils/__init__.py | 10 + timm/utils/checkpoint_saver.py | 153 +++++++++++++ timm/utils/cuda.py | 47 ++++ timm/utils/distributed.py | 28 +++ timm/utils/jit.py | 18 ++ timm/utils/log.py | 28 +++ timm/utils/metrics.py | 32 +++ timm/utils/misc.py | 18 ++ timm/utils/model.py | 16 ++ timm/utils/model_ema.py | 77 +++++++ timm/utils/summary.py | 34 +++ train.py | 6 +- 13 files changed, 466 insertions(+), 401 deletions(-) delete mode 100644 timm/utils.py create mode 100644 timm/utils/__init__.py create mode 100644 timm/utils/checkpoint_saver.py create mode 100644 timm/utils/cuda.py create mode 100644 timm/utils/distributed.py create mode 100644 timm/utils/jit.py create mode 100644 timm/utils/log.py create mode 100644 timm/utils/metrics.py create mode 100644 timm/utils/misc.py create mode 100644 timm/utils/model.py create mode 100644 timm/utils/model_ema.py create mode 100644 timm/utils/summary.py diff --git a/timm/utils.py b/timm/utils.py deleted file mode 100644 index 94f85d84..00000000 --- a/timm/utils.py +++ /dev/null @@ -1,400 +0,0 @@ -""" Common training and validation utilities - -Hacked together by / Copyright 2020 Ross Wightman -""" - -from copy import deepcopy - -import torch -import math -import os -import re -import shutil -import glob -import csv -import operator -import logging -import logging.handlers -import numpy as np -from collections import OrderedDict -try: - from apex import amp - has_apex = True -except ImportError: - amp = None - has_apex = False - -from torch import distributed as dist - - -_logger = logging.getLogger(__name__) - - -def unwrap_model(model): - if isinstance(model, ModelEma): - return unwrap_model(model.ema) - else: - return model.module if hasattr(model, 'module') else model - - -def get_state_dict(model, unwrap_fn=unwrap_model): - return unwrap_fn(model).state_dict() - - -class ApexScaler: - state_dict_key = "amp" - - def __call__(self, loss, optimizer): - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - optimizer.step() - - def state_dict(self): - if 'state_dict' in amp.__dict__: - return amp.state_dict() - - def load_state_dict(self, state_dict): - if 'load_state_dict' in amp.__dict__: - amp.load_state_dict(state_dict) - - -class NativeScaler: - state_dict_key = "amp_scaler" - - def __init__(self): - self._scaler = torch.cuda.amp.GradScaler() - - def __call__(self, loss, optimizer): - self._scaler.scale(loss).backward() - self._scaler.step(optimizer) - self._scaler.update() - - def state_dict(self): - return self._scaler.state_dict() - - def load_state_dict(self, state_dict): - self._scaler.load_state_dict(state_dict) - - -class CheckpointSaver: - def __init__( - self, - model, - optimizer, - args=None, - model_ema=None, - amp_scaler=None, - checkpoint_prefix='checkpoint', - recovery_prefix='recovery', - checkpoint_dir='', - recovery_dir='', - decreasing=False, - max_history=10, - unwrap_fn=unwrap_model): - - # objects to save state_dicts of - self.model = model - self.optimizer = optimizer - self.args = args - self.model_ema = model_ema - self.amp_scaler = amp_scaler - - # state - self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness - self.best_epoch = None - self.best_metric = None - self.curr_recovery_file = '' - self.last_recovery_file = '' - - # 
config - self.checkpoint_dir = checkpoint_dir - self.recovery_dir = recovery_dir - self.save_prefix = checkpoint_prefix - self.recovery_prefix = recovery_prefix - self.extension = '.pth.tar' - self.decreasing = decreasing # a lower metric is better if True - self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs - self.max_history = max_history - self.unwrap_fn = unwrap_fn - assert self.max_history >= 1 - - def save_checkpoint(self, epoch, metric=None): - assert epoch >= 0 - tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) - last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) - self._save(tmp_save_path, epoch, metric) - if os.path.exists(last_save_path): - os.unlink(last_save_path) # required for Windows support. - os.rename(tmp_save_path, last_save_path) - worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None - if (len(self.checkpoint_files) < self.max_history - or metric is None or self.cmp(metric, worst_file[1])): - if len(self.checkpoint_files) >= self.max_history: - self._cleanup_checkpoints(1) - filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension - save_path = os.path.join(self.checkpoint_dir, filename) - os.link(last_save_path, save_path) - self.checkpoint_files.append((save_path, metric)) - self.checkpoint_files = sorted( - self.checkpoint_files, key=lambda x: x[1], - reverse=not self.decreasing) # sort in descending order if a lower metric is not better - - checkpoints_str = "Current checkpoints:\n" - for c in self.checkpoint_files: - checkpoints_str += ' {}\n'.format(c) - _logger.info(checkpoints_str) - - if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): - self.best_epoch = epoch - self.best_metric = metric - best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) - if os.path.exists(best_save_path): - os.unlink(best_save_path) - os.link(last_save_path, best_save_path) - - return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) - - def _save(self, save_path, epoch, metric=None): - save_state = { - 'epoch': epoch, - 'arch': type(self.model).__name__.lower(), - 'state_dict': get_state_dict(self.model, self.unwrap_fn), - 'optimizer': self.optimizer.state_dict(), - 'version': 2, # version < 2 increments epoch before save - } - if self.args is not None: - save_state['arch'] = self.args.model - save_state['args'] = self.args - if self.amp_scaler is not None: - save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() - if self.model_ema is not None: - save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) - if metric is not None: - save_state['metric'] = metric - torch.save(save_state, save_path) - - def _cleanup_checkpoints(self, trim=0): - trim = min(len(self.checkpoint_files), trim) - delete_index = self.max_history - trim - if delete_index <= 0 or len(self.checkpoint_files) <= delete_index: - return - to_delete = self.checkpoint_files[delete_index:] - for d in to_delete: - try: - _logger.debug("Cleaning checkpoint: {}".format(d)) - os.remove(d[0]) - except Exception as e: - _logger.error("Exception '{}' while deleting checkpoint".format(e)) - self.checkpoint_files = self.checkpoint_files[:delete_index] - - def save_recovery(self, epoch, batch_idx=0): - assert epoch >= 0 - filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension - save_path = os.path.join(self.recovery_dir, filename) - 
self._save(save_path, epoch) - if os.path.exists(self.last_recovery_file): - try: - _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) - os.remove(self.last_recovery_file) - except Exception as e: - _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) - self.last_recovery_file = self.curr_recovery_file - self.curr_recovery_file = save_path - - def find_recovery(self): - recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) - files = glob.glob(recovery_path + '*' + self.extension) - files = sorted(files) - if len(files): - return files[0] - else: - return '' - - -class AverageMeter: - """Computes and stores the average and current value""" - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - maxk = max(topk) - batch_size = target.size(0) - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - return [correct[:k].view(-1).float().sum(0) * 100. / batch_size for k in topk] - - -def get_outdir(path, *paths, inc=False): - outdir = os.path.join(path, *paths) - if not os.path.exists(outdir): - os.makedirs(outdir) - elif inc: - count = 1 - outdir_inc = outdir + '-' + str(count) - while os.path.exists(outdir_inc): - count = count + 1 - outdir_inc = outdir + '-' + str(count) - assert count < 100 - outdir = outdir_inc - os.makedirs(outdir) - return outdir - - -def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False): - rowd = OrderedDict(epoch=epoch) - rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) - rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) - with open(filename, mode='a') as cf: - dw = csv.DictWriter(cf, fieldnames=rowd.keys()) - if write_header: # first iteration (epoch == 1 can't be used) - dw.writeheader() - dw.writerow(rowd) - - -def natural_key(string_): - """See http://www.codinghorror.com/blog/archives/001018.html""" - return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] - - -def reduce_tensor(tensor, n): - rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= n - return rt - - -def distribute_bn(model, world_size, reduce=False): - # ensure every node has the same running bn stats - for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): - if ('running_mean' in bn_name) or ('running_var' in bn_name): - if reduce: - # average bn stats across whole group - torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) - bn_buf /= float(world_size) - else: - # broadcast bn stats from rank 0 to whole group - torch.distributed.broadcast(bn_buf, 0) - - -class ModelEma: - """ Model Exponential Moving Average - Keep a moving average of everything in the model state_dict (parameters and buffers). - - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - - A smoothed version of the weights is necessary for some training schemes to perform well. - E.g. 
Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use - RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA - smoothing of weights to match results. Pay attention to the decay constant you are using - relative to your update count per epoch. - - To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but - disable validation of the EMA weights. Validation will have to be done manually in a separate - process, or after the training stops converging. - - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. - I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU. - """ - def __init__(self, model, decay=0.9999, device='', resume=''): - # make a copy of the model for accumulating moving average of weights - self.ema = deepcopy(model) - self.ema.eval() - self.decay = decay - self.device = device # perform ema on different device from model if set - if device: - self.ema.to(device=device) - self.ema_has_module = hasattr(self.ema, 'module') - if resume: - self._load_checkpoint(resume) - for p in self.ema.parameters(): - p.requires_grad_(False) - - def _load_checkpoint(self, checkpoint_path): - checkpoint = torch.load(checkpoint_path, map_location='cpu') - assert isinstance(checkpoint, dict) - if 'state_dict_ema' in checkpoint: - new_state_dict = OrderedDict() - for k, v in checkpoint['state_dict_ema'].items(): - # ema model may have been wrapped by DataParallel, and need module prefix - if self.ema_has_module: - name = 'module.' + k if not k.startswith('module') else k - else: - name = k - new_state_dict[name] = v - self.ema.load_state_dict(new_state_dict) - _logger.info("Loaded state_dict_ema") - else: - _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") - - def update(self, model): - # correct a mismatch in state dict keys - needs_module = hasattr(model, 'module') and not self.ema_has_module - with torch.no_grad(): - msd = model.state_dict() - for k, ema_v in self.ema.state_dict().items(): - if needs_module: - k = 'module.' + k - model_v = msd[k].detach() - if self.device: - model_v = model_v.to(device=self.device) - ema_v.copy_(ema_v * self.decay + (1. 
- self.decay) * model_v) - - -class FormatterNoInfo(logging.Formatter): - def __init__(self, fmt='%(levelname)s: %(message)s'): - logging.Formatter.__init__(self, fmt) - - def format(self, record): - if record.levelno == logging.INFO: - return str(record.getMessage()) - return logging.Formatter.format(self, record) - - -def setup_default_logging(default_level=logging.INFO, log_path=''): - console_handler = logging.StreamHandler() - console_handler.setFormatter(FormatterNoInfo()) - logging.root.addHandler(console_handler) - logging.root.setLevel(default_level) - if log_path: - file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) - file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") - file_handler.setFormatter(file_formatter) - logging.root.addHandler(file_handler) - - -def add_bool_arg(parser, name, default=False, help=''): - dest_name = name.replace('-', '_') - group = parser.add_mutually_exclusive_group(required=False) - group.add_argument('--' + name, dest=dest_name, action='store_true', help=help) - group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help) - parser.set_defaults(**{dest_name: default}) - - -def set_jit_legacy(): - """ Set JIT executor to legacy w/ support for op fusion - This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes - in the JIT exectutor. These API are not supported so could change. - """ - # - assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" - torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - torch._C._jit_override_can_fuse_on_gpu(True) - #torch._C._jit_set_texpr_fuser_enabled(True) diff --git a/timm/utils/__init__.py b/timm/utils/__init__.py new file mode 100644 index 00000000..6efc2115 --- /dev/null +++ b/timm/utils/__init__.py @@ -0,0 +1,10 @@ +from .checkpoint_saver import CheckpointSaver +from .cuda import ApexScaler, NativeScaler +from .distributed import distribute_bn, reduce_tensor +from .jit import set_jit_legacy +from .log import setup_default_logging, FormatterNoInfo +from .metrics import AverageMeter, accuracy +from .misc import natural_key, add_bool_arg +from .model import unwrap_model, get_state_dict +from .model_ema import ModelEma +from .summary import update_summary, get_outdir diff --git a/timm/utils/checkpoint_saver.py b/timm/utils/checkpoint_saver.py new file mode 100644 index 00000000..51896e78 --- /dev/null +++ b/timm/utils/checkpoint_saver.py @@ -0,0 +1,153 @@ +""" Checkpoint Saver + +Track top-n training checkpoints and maintain recovery checkpoints on specified intervals. 
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import glob +import operator +import os +import logging + +import torch + +from .model import unwrap_model, get_state_dict + + +_logger = logging.getLogger(__name__) + + +class CheckpointSaver: + def __init__( + self, + model, + optimizer, + args=None, + model_ema=None, + amp_scaler=None, + checkpoint_prefix='checkpoint', + recovery_prefix='recovery', + checkpoint_dir='', + recovery_dir='', + decreasing=False, + max_history=10, + unwrap_fn=unwrap_model): + + # objects to save state_dicts of + self.model = model + self.optimizer = optimizer + self.args = args + self.model_ema = model_ema + self.amp_scaler = amp_scaler + + # state + self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness + self.best_epoch = None + self.best_metric = None + self.curr_recovery_file = '' + self.last_recovery_file = '' + + # config + self.checkpoint_dir = checkpoint_dir + self.recovery_dir = recovery_dir + self.save_prefix = checkpoint_prefix + self.recovery_prefix = recovery_prefix + self.extension = '.pth.tar' + self.decreasing = decreasing # a lower metric is better if True + self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs + self.max_history = max_history + self.unwrap_fn = unwrap_fn + assert self.max_history >= 1 + + def save_checkpoint(self, epoch, metric=None): + assert epoch >= 0 + tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) + last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) + self._save(tmp_save_path, epoch, metric) + if os.path.exists(last_save_path): + os.unlink(last_save_path) # required for Windows support. + os.rename(tmp_save_path, last_save_path) + worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None + if (len(self.checkpoint_files) < self.max_history + or metric is None or self.cmp(metric, worst_file[1])): + if len(self.checkpoint_files) >= self.max_history: + self._cleanup_checkpoints(1) + filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension + save_path = os.path.join(self.checkpoint_dir, filename) + os.link(last_save_path, save_path) + self.checkpoint_files.append((save_path, metric)) + self.checkpoint_files = sorted( + self.checkpoint_files, key=lambda x: x[1], + reverse=not self.decreasing) # sort in descending order if a lower metric is not better + + checkpoints_str = "Current checkpoints:\n" + for c in self.checkpoint_files: + checkpoints_str += ' {}\n'.format(c) + _logger.info(checkpoints_str) + + if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): + self.best_epoch = epoch + self.best_metric = metric + best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) + if os.path.exists(best_save_path): + os.unlink(best_save_path) + os.link(last_save_path, best_save_path) + + return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) + + def _save(self, save_path, epoch, metric=None): + save_state = { + 'epoch': epoch, + 'arch': type(self.model).__name__.lower(), + 'state_dict': get_state_dict(self.model, self.unwrap_fn), + 'optimizer': self.optimizer.state_dict(), + 'version': 2, # version < 2 increments epoch before save + } + if self.args is not None: + save_state['arch'] = self.args.model + save_state['args'] = self.args + if self.amp_scaler is not None: + save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() + if self.model_ema is not None: + 
save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) + if metric is not None: + save_state['metric'] = metric + torch.save(save_state, save_path) + + def _cleanup_checkpoints(self, trim=0): + trim = min(len(self.checkpoint_files), trim) + delete_index = self.max_history - trim + if delete_index <= 0 or len(self.checkpoint_files) <= delete_index: + return + to_delete = self.checkpoint_files[delete_index:] + for d in to_delete: + try: + _logger.debug("Cleaning checkpoint: {}".format(d)) + os.remove(d[0]) + except Exception as e: + _logger.error("Exception '{}' while deleting checkpoint".format(e)) + self.checkpoint_files = self.checkpoint_files[:delete_index] + + def save_recovery(self, epoch, batch_idx=0): + assert epoch >= 0 + filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension + save_path = os.path.join(self.recovery_dir, filename) + self._save(save_path, epoch) + if os.path.exists(self.last_recovery_file): + try: + _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) + os.remove(self.last_recovery_file) + except Exception as e: + _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) + self.last_recovery_file = self.curr_recovery_file + self.curr_recovery_file = save_path + + def find_recovery(self): + recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) + files = glob.glob(recovery_path + '*' + self.extension) + files = sorted(files) + if len(files): + return files[0] + else: + return '' diff --git a/timm/utils/cuda.py b/timm/utils/cuda.py new file mode 100644 index 00000000..695f40b1 --- /dev/null +++ b/timm/utils/cuda.py @@ -0,0 +1,47 @@ +""" CUDA / AMP utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + +try: + from apex import amp + has_apex = True +except ImportError: + amp = None + has_apex = False + + +class ApexScaler: + state_dict_key = "amp" + + def __call__(self, loss, optimizer): + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer.step() + + def state_dict(self): + if 'state_dict' in amp.__dict__: + return amp.state_dict() + + def load_state_dict(self, state_dict): + if 'load_state_dict' in amp.__dict__: + amp.load_state_dict(state_dict) + + +class NativeScaler: + state_dict_key = "amp_scaler" + + def __init__(self): + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer): + self._scaler.scale(loss).backward() + self._scaler.step(optimizer) + self._scaler.update() + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) diff --git a/timm/utils/distributed.py b/timm/utils/distributed.py new file mode 100644 index 00000000..3c5dba8c --- /dev/null +++ b/timm/utils/distributed.py @@ -0,0 +1,28 @@ +""" Distributed training/validation utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import distributed as dist + +from .model import unwrap_model + + +def reduce_tensor(tensor, n): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= n + return rt + + +def distribute_bn(model, world_size, reduce=False): + # ensure every node has the same running bn stats + for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): + if ('running_mean' in bn_name) or ('running_var' in bn_name): + if reduce: + # average bn stats across whole group + torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) + bn_buf /= 
float(world_size) + else: + # broadcast bn stats from rank 0 to whole group + torch.distributed.broadcast(bn_buf, 0) diff --git a/timm/utils/jit.py b/timm/utils/jit.py new file mode 100644 index 00000000..185ab7a0 --- /dev/null +++ b/timm/utils/jit.py @@ -0,0 +1,18 @@ +""" JIT scripting/tracing utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + + +def set_jit_legacy(): + """ Set JIT executor to legacy w/ support for op fusion + This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes + in the JIT exectutor. These API are not supported so could change. + """ + # + assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" + torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._jit_override_can_fuse_on_gpu(True) + #torch._C._jit_set_texpr_fuser_enabled(True) diff --git a/timm/utils/log.py b/timm/utils/log.py new file mode 100644 index 00000000..c99469e0 --- /dev/null +++ b/timm/utils/log.py @@ -0,0 +1,28 @@ +""" Logging helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +import logging.handlers + + +class FormatterNoInfo(logging.Formatter): + def __init__(self, fmt='%(levelname)s: %(message)s'): + logging.Formatter.__init__(self, fmt) + + def format(self, record): + if record.levelno == logging.INFO: + return str(record.getMessage()) + return logging.Formatter.format(self, record) + + +def setup_default_logging(default_level=logging.INFO, log_path=''): + console_handler = logging.StreamHandler() + console_handler.setFormatter(FormatterNoInfo()) + logging.root.addHandler(console_handler) + logging.root.setLevel(default_level) + if log_path: + file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) + file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") + file_handler.setFormatter(file_formatter) + logging.root.addHandler(file_handler) diff --git a/timm/utils/metrics.py b/timm/utils/metrics.py new file mode 100644 index 00000000..08f9a9b3 --- /dev/null +++ b/timm/utils/metrics.py @@ -0,0 +1,32 @@ +""" Eval metrics and related + +Hacked together by / Copyright 2020 Ross Wightman +""" + + +class AverageMeter: + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [correct[:k].view(-1).float().sum(0) * 100. 
/ batch_size for k in topk] diff --git a/timm/utils/misc.py b/timm/utils/misc.py new file mode 100644 index 00000000..39c0097c --- /dev/null +++ b/timm/utils/misc.py @@ -0,0 +1,18 @@ +""" Misc utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import re + + +def natural_key(string_): + """See http://www.codinghorror.com/blog/archives/001018.html""" + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def add_bool_arg(parser, name, default=False, help=''): + dest_name = name.replace('-', '_') + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--' + name, dest=dest_name, action='store_true', help=help) + group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help) + parser.set_defaults(**{dest_name: default}) diff --git a/timm/utils/model.py b/timm/utils/model.py new file mode 100644 index 00000000..cfd42806 --- /dev/null +++ b/timm/utils/model.py @@ -0,0 +1,16 @@ +""" Model / state_dict utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +from .model_ema import ModelEma + + +def unwrap_model(model): + if isinstance(model, ModelEma): + return unwrap_model(model.ema) + else: + return model.module if hasattr(model, 'module') else model + + +def get_state_dict(model, unwrap_fn=unwrap_model): + return unwrap_fn(model).state_dict() diff --git a/timm/utils/model_ema.py b/timm/utils/model_ema.py new file mode 100644 index 00000000..b788b32e --- /dev/null +++ b/timm/utils/model_ema.py @@ -0,0 +1,77 @@ +""" Exponential Moving Average (EMA) of model updates + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +from collections import OrderedDict +from copy import deepcopy + +import torch + +_logger = logging.getLogger(__name__) + + +class ModelEma: + """ Model Exponential Moving Average + Keep a moving average of everything in the model state_dict (parameters and buffers). + + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + + A smoothed version of the weights is necessary for some training schemes to perform well. + E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use + RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA + smoothing of weights to match results. Pay attention to the decay constant you are using + relative to your update count per epoch. + + To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but + disable validation of the EMA weights. Validation will have to be done manually in a separate + process, or after the training stops converging. + + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + I've tested with the sequence in my own train.py for torch.DataParallel, apex.DDP, and single-GPU. 
+ """ + def __init__(self, model, decay=0.9999, device='', resume=''): + # make a copy of the model for accumulating moving average of weights + self.ema = deepcopy(model) + self.ema.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if device: + self.ema.to(device=device) + self.ema_has_module = hasattr(self.ema, 'module') + if resume: + self._load_checkpoint(resume) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def _load_checkpoint(self, checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + assert isinstance(checkpoint, dict) + if 'state_dict_ema' in checkpoint: + new_state_dict = OrderedDict() + for k, v in checkpoint['state_dict_ema'].items(): + # ema model may have been wrapped by DataParallel, and need module prefix + if self.ema_has_module: + name = 'module.' + k if not k.startswith('module') else k + else: + name = k + new_state_dict[name] = v + self.ema.load_state_dict(new_state_dict) + _logger.info("Loaded state_dict_ema") + else: + _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") + + def update(self, model): + # correct a mismatch in state dict keys + needs_module = hasattr(model, 'module') and not self.ema_has_module + with torch.no_grad(): + msd = model.state_dict() + for k, ema_v in self.ema.state_dict().items(): + if needs_module: + k = 'module.' + k + model_v = msd[k].detach() + if self.device: + model_v = model_v.to(device=self.device) + ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v) diff --git a/timm/utils/summary.py b/timm/utils/summary.py new file mode 100644 index 00000000..a0801eaa --- /dev/null +++ b/timm/utils/summary.py @@ -0,0 +1,34 @@ +""" Summary utilities + +Hacked together by / Copyright 2020 Ross Wightman +""" +import csv +import os +from collections import OrderedDict + + +def get_outdir(path, *paths, inc=False): + outdir = os.path.join(path, *paths) + if not os.path.exists(outdir): + os.makedirs(outdir) + elif inc: + count = 1 + outdir_inc = outdir + '-' + str(count) + while os.path.exists(outdir_inc): + count = count + 1 + outdir_inc = outdir + '-' + str(count) + assert count < 100 + outdir = outdir_inc + os.makedirs(outdir) + return outdir + + +def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False): + rowd = OrderedDict(epoch=epoch) + rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) + rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) + with open(filename, mode='a') as cf: + dw = csv.DictWriter(cf, fieldnames=rowd.keys()) + if write_header: # first iteration (epoch == 1 can't be used) + dw.writeheader() + dw.writerow(rowd) diff --git a/train.py b/train.py index 260de18b..023a51ca 100755 --- a/train.py +++ b/train.py @@ -17,9 +17,13 @@ Hacked together by / Copyright 2020 Ross Wightman (https://github.com/rwightman) import argparse import time import yaml -from datetime import datetime +import os +import logging +from collections import OrderedDict from contextlib import suppress +from datetime import datetime +import torch import torch.nn as nn import torchvision.utils from torch.nn.parallel import DistributedDataParallel as NativeDDP