diff --git a/clean_checkpoint.py b/clean_checkpoint.py
index a8edcc91..1553fc4b 100755
--- a/clean_checkpoint.py
+++ b/clean_checkpoint.py
@@ -14,6 +14,9 @@ import hashlib
 import shutil
 from collections import OrderedDict
 
+from timm.models.helpers import load_state_dict
+from timm.utils import setup_default_logging
+
 parser = argparse.ArgumentParser(description='PyTorch Checkpoint Cleaner')
 parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
                     help='path to latest checkpoint (default: none)')
@@ -29,6 +32,7 @@ _TEMP_NAME = './_checkpoint.pth'
 
 def main():
     args = parser.parse_args()
+    setup_default_logging()
 
     if os.path.exists(args.output):
         print("Error: Output filename ({}) already exists.".format(args.output))
@@ -37,17 +41,8 @@ def main():
     # Load an existing checkpoint to CPU, strip everything but the state_dict and re-save
     if args.checkpoint and os.path.isfile(args.checkpoint):
         print("=> Loading checkpoint '{}'".format(args.checkpoint))
-        checkpoint = torch.load(args.checkpoint, map_location='cpu')
-
+        state_dict = load_state_dict(args.checkpoint, use_ema=args.use_ema)
         new_state_dict = OrderedDict()
-        if isinstance(checkpoint, dict):
-            state_dict_key = 'state_dict_ema' if args.use_ema else 'state_dict'
-            if state_dict_key in checkpoint:
-                state_dict = checkpoint[state_dict_key]
-            else:
-                state_dict = checkpoint
-        else:
-            assert False
         for k, v in state_dict.items():
             if args.clean_aux_bn and 'aux_bn' in k:
                 # If all aux_bn keys are removed, the SplitBN layers will end up as normal and
diff --git a/inference.py b/inference.py
index 5fcf1e60..1f248dc7 100755
--- a/inference.py
+++ b/inference.py
@@ -13,7 +13,7 @@ import numpy as np
 import torch
 
 from timm.models import create_model, apply_test_time_pool
-from timm.data import ImageDataset, create_loader, resolve_data_config
+from timm.data import ImageDataset, create_loader_v2, resolve_data_config
 from timm.utils import AverageMeter, setup_default_logging
 
 torch.backends.cudnn.benchmark = True
@@ -82,7 +82,7 @@ def main():
     else:
         model = model.cuda()
 
-    loader = create_loader(
+    loader = create_loader_v2(
         ImageDataset(args.data),
         input_size=config['input_size'],
         batch_size=args.batch_size,
diff --git a/timm/bits/device_env.py b/timm/bits/device_env.py
index 0a926e69..e992ee80 100644
--- a/timm/bits/device_env.py
+++ b/timm/bits/device_env.py
@@ -128,6 +128,9 @@ class DeviceEnv:
     def mark_step(self):
         pass  # NO-OP for non-XLA devices
 
+    def synchronize(self, tensors: Optional[TensorList] = None):
+        pass
+
     def all_reduce_(self, tensor: TensorList, op=dist.ReduceOp.SUM, average=False):
         dist.all_reduce(tensor, op=op)
         if average:
diff --git a/timm/bits/device_env_cuda.py b/timm/bits/device_env_cuda.py
index c57dfda5..33760d97 100644
--- a/timm/bits/device_env_cuda.py
+++ b/timm/bits/device_env_cuda.py
@@ -6,7 +6,7 @@ from typing import Optional
 import torch
 from torch.nn.parallel import DistributedDataParallel, DataParallel
 
-from .device_env import DeviceEnv, DeviceEnvType
+from .device_env import DeviceEnv, DeviceEnvType, TensorList
 
 
 def is_cuda_available():
@@ -63,3 +63,6 @@ class DeviceEnvCuda(DeviceEnv):
         assert not self.distributed
         wrapped = [DataParallel(m, **kwargs) for m in modules]
         return wrapped[0] if len(wrapped) == 1 else wrapped
+
+    def synchronize(self, tensors: Optional[TensorList] = None):
+        torch.cuda.synchronize(self.device)
diff --git a/timm/bits/device_env_xla.py b/timm/bits/device_env_xla.py
index 46517f7a..2dad9273 100644
--- a/timm/bits/device_env_xla.py
+++ b/timm/bits/device_env_xla.py
@@ -8,9 +8,11 @@
 from torch.distributed import ReduceOp
 
 try:
     import torch_xla.core.xla_model as xm
+    import torch_xla
     _HAS_XLA = True
 except ImportError as e:
     xm = None
+    torch_xla = None
     _HAS_XLA = False
 
 try:
@@ -81,6 +83,9 @@ class DeviceEnvXla(DeviceEnv):
     def mark_step(self):
         xm.mark_step()
 
+    def synchronize(self, tensors: Optional[TensorList] = None):
+        torch_xla._XLAC._xla_sync_multi(tensors, devices=[], wait=True, sync_xla_data=True)
+
     def all_reduce(self, tensor: torch.Tensor, op=ReduceOp.SUM, average=False):
         assert isinstance(tensor, torch.Tensor)  # unlike in-place variant, lists/tuples not allowed
         op = _PT_TO_XM_OP[op]
diff --git a/timm/bits/train_setup.py b/timm/bits/train_setup.py
index 1480de63..5aca908f 100644
--- a/timm/bits/train_setup.py
+++ b/timm/bits/train_setup.py
@@ -89,7 +89,6 @@ def setup_model_and_optimizer(
     train_state = TrainState(model=model, updater=updater, model_ema=model_ema)
 
     if resume_path:
-        # FIXME this is not implemented yet, do a hack job before proper TrainState serialization?
         load_train_state(
             train_state,
             resume_path,
@@ -141,11 +140,7 @@ def setup_model_and_optimizer_deepspeed(
 
     if resume_path:
         # FIXME deepspeed resumes differently
-        load_legacy_checkpoint(
-            train_state,
-            resume_path,
-            load_opt=resume_opt,
-            log_info=dev_env.primary)
+        assert False, "DeepSpeed checkpoint resume is not yet supported"
 
     if dev_env.distributed:
         train_state = dataclasses.replace(
diff --git a/timm/data/__init__.py b/timm/data/__init__.py
index 7d3cb2b4..163bcea7 100644
--- a/timm/data/__init__.py
+++ b/timm/data/__init__.py
@@ -4,9 +4,9 @@ from .config import resolve_data_config
 from .constants import *
 from .dataset import ImageDataset, IterableImageDataset, AugMixDataset
 from .dataset_factory import create_dataset
-from .loader import create_loader
+from .loader import create_loader_v2, PreprocessCfg, AugCfg, MixupCfg
 from .mixup import Mixup, FastCollateMixup
 from .parsers import create_parser
 from .real_labels import RealLabelsImagenet
-from .transforms import *
-from .transforms_factory import create_transform
\ No newline at end of file
+from .transforms import RandomResizedCropAndInterpolation, ToTensor, ToNumpy
+from .transforms_factory import create_transform_v2, create_transform
diff --git a/timm/data/auto_augment.py b/timm/data/auto_augment.py
index 7cbd2dee..46c36531 100644
--- a/timm/data/auto_augment.py
+++ b/timm/data/auto_augment.py
@@ -41,6 +41,22 @@ _HPARAMS_DEFAULT = dict(
 _RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
 
 
+def _pil_interp(method):
+    def _convert(m):
+        if m == 'bicubic':
+            return Image.BICUBIC
+        elif m == 'lanczos':
+            return Image.LANCZOS
+        elif m == 'hamming':
+            return Image.HAMMING
+        else:
+            return Image.BILINEAR
+    if isinstance(method, (list, tuple)):
+        return [_convert(m) if isinstance(m, str) else m for m in method]
+    else:
+        return _convert(method) if isinstance(method, str) else method
+
+
 def _interpolation(kwargs):
     interpolation = kwargs.pop('resample', Image.BILINEAR)
     if isinstance(interpolation, (list, tuple)):
@@ -325,7 +341,7 @@ class AugmentOp:
         self.hparams = hparams.copy()
         self.kwargs = dict(
             fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL,
-            resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION,
+            resample=_pil_interp(hparams['interpolation']) if 'interpolation' in hparams else _RANDOM_INTERPOLATION,
         )
 
         # If magnitude_std is > 0, we introduce some randomness
diff --git a/timm/data/collate.py b/timm/data/collate.py
index a1e37e1f..28f2af2a 100644
--- a/timm/data/collate.py
+++ b/timm/data/collate.py
@@ -30,7 +30,7 @@ def fast_collate(batch):
     elif isinstance(batch[0][0], torch.Tensor):
         targets = torch.tensor([b[1] for b in batch], dtype=torch.int64)
         assert len(targets) == batch_size
-        tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
+        tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=batch[0][0].dtype)
         for i in range(batch_size):
             tensor[i].copy_(batch[i][0])
         return tensor, targets
diff --git a/timm/data/config.py b/timm/data/config.py
index 06920d7d..f9ed7b6c 100644
--- a/timm/data/config.py
+++ b/timm/data/config.py
@@ -1,10 +1,53 @@
 import logging
+from dataclasses import dataclass
+from typing import Tuple, Optional, Union
+
 from .constants import *
 
 _logger = logging.getLogger(__name__)
 
 
+@dataclass
+class AugCfg:
+    scale_range: Tuple[float, float] = (0.08, 1.0)
+    ratio_range: Tuple[float, float] = (3 / 4, 4 / 3)
+    hflip_prob: float = 0.5
+    vflip_prob: float = 0.
+
+    color_jitter: float = 0.4
+    auto_augment: Optional[str] = None
+
+    re_prob: float = 0.
+    re_mode: str = 'const'
+    re_count: int = 1
+
+    num_aug_splits: int = 0
+
+
+@dataclass
+class PreprocessCfg:
+    input_size: Tuple[int, int, int] = (3, 224, 224)
+    mean: Tuple[float, ...] = IMAGENET_DEFAULT_MEAN
+    std: Tuple[float, ...] = IMAGENET_DEFAULT_STD
+    interpolation: str = 'bilinear'
+    crop_pct: float = 0.875
+    aug: Optional[AugCfg] = None
+
+
+@dataclass
+class MixupCfg:
+    prob: float = 1.0
+    switch_prob: float = 0.5
+    mixup_alpha: float = 1.
+    cutmix_alpha: float = 0.
+    cutmix_minmax: Optional[Tuple[float, float]] = None
+    mode: str = 'batch'
+    correct_lam: bool = True
+    label_smoothing: float = 0.1
+    num_classes: int = 0
+
+
 def resolve_data_config(args, default_cfg={}, model=None, use_test_size=False, verbose=False):
     new_config = {}
     default_cfg = default_cfg
diff --git a/timm/data/fetcher.py b/timm/data/fetcher.py
index ec5afe8a..c833b596 100644
--- a/timm/data/fetcher.py
+++ b/timm/data/fetcher.py
@@ -2,7 +2,7 @@ import torch
 
 from .constants import *
 from .random_erasing import RandomErasing
-from. mixup import FastCollateMixup
+from .mixup import FastCollateMixup
 
 
 class FetcherXla:
@@ -12,31 +12,54 @@ class Fetcher:
 
-    def __init__(self,
-                 loader,
-                 mean=IMAGENET_DEFAULT_MEAN,
-                 std=IMAGENET_DEFAULT_STD,
-                 device=None,
-                 dtype=None,
-                 re_prob=0.,
-                 re_mode='const',
-                 re_count=1,
-                 re_num_splits=0):
+    def __init__(
+            self,
+            loader,
+            device: torch.device,
+            dtype=torch.float32,
+            normalize=True,
+            normalize_shape=(1, 3, 1, 1),
+            mean=IMAGENET_DEFAULT_MEAN,
+            std=IMAGENET_DEFAULT_STD,
+            re_prob=0.,
+            re_mode='const',
+            re_count=1,
+            num_aug_splits=0,
+            use_mp_loader=False,
+    ):
         self.loader = loader
         self.device = torch.device(device)
-        self.dtype = dtype or torch.float32
-        self.mean = torch.tensor([x * 255 for x in mean], dtype=self.dtype, device=self.device).view(1, 3, 1, 1)
-        self.std = torch.tensor([x * 255 for x in std], dtype=self.dtype, device=self.device).view(1, 3, 1, 1)
+        self.dtype = dtype
+        if normalize:
+            self.mean = torch.tensor(
+                [x * 255 for x in mean], dtype=self.dtype, device=self.device).view(normalize_shape)
+            self.std = torch.tensor(
+                [x * 255 for x in std], dtype=self.dtype, device=self.device).view(normalize_shape)
+        else:
+            self.mean = None
+            self.std = None
         if re_prob > 0.:
+            # NOTE RandomErasing shouldn't be used here w/ XLA devices
             self.random_erasing = RandomErasing(
-                probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device=device)
+                probability=re_prob, mode=re_mode, count=re_count, num_splits=num_aug_splits)
         else:
             self.random_erasing = None
+        self.use_mp_loader = use_mp_loader
+        if use_mp_loader:
+            # FIXME testing for TPU use
+            import torch_xla.distributed.parallel_loader as pl
+            self._loader = pl.MpDeviceLoader(loader, device)
+        else:
+            self._loader = loader
 
     def __iter__(self):
-        for sample, target in self.loader:
-            sample = sample.to(device=self.device, dtype=self.dtype).sub_(self.mean).div_(self.std)
-            target = target.to(device=self.device)
+        for sample, target in self._loader:
+            if not self.use_mp_loader:
+                sample = sample.to(device=self.device)
+                target = target.to(device=self.device)
+            sample = sample.to(dtype=self.dtype)
+            if self.mean is not None:
+                sample.sub_(self.mean).div_(self.std)
             if self.random_erasing is not None:
                 sample = self.random_erasing(sample)
             yield sample, target
diff --git a/timm/data/loader.py b/timm/data/loader.py
index e8722b29..9d60cd59 100644
--- a/timm/data/loader.py
+++ b/timm/data/loader.py
@@ -6,74 +6,52 @@ https://github.com/NVIDIA/apex/commit/d5e2bb4bdeedd27b1dfaf5bb2b24d6c000dee9be#d
 
 Hacked together by / Copyright 2020 Ross Wightman
 """
+from typing import Tuple, Optional, Union, Callable
+
 import torch.utils.data
 
 from timm.bits import DeviceEnv
-
-from .fetcher import Fetcher
-from .prefetcher_cuda import PrefetcherCuda
 from .collate import fast_collate
-from .transforms_factory import create_transform
-from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .config import PreprocessCfg, AugCfg, MixupCfg
 from .distributed_sampler import OrderedDistributedSampler
+from .fetcher import Fetcher
+from .mixup import FastCollateMixup
+from .prefetcher_cuda import PrefetcherCuda
 
 
-def create_loader(
-        dataset,
-        input_size,
-        batch_size,
-        is_training=False,
-        dev_env=None,
-        no_aug=False,
-        re_prob=0.,
-        re_mode='const',
-        re_count=1,
-        re_split=False,
-        scale=None,
-        ratio=None,
-        hflip=0.5,
-        vflip=0.,
-        color_jitter=0.4,
-        auto_augment=None,
-        num_aug_splits=0,
-        interpolation='bilinear',
-        mean=IMAGENET_DEFAULT_MEAN,
-        std=IMAGENET_DEFAULT_STD,
-        num_workers=1,
-        crop_pct=None,
-        collate_fn=None,
-        pin_memory=False,
-        tf_preprocessing=False,
-        use_multi_epochs_loader=False,
-        persistent_workers=True,
+def create_loader_v2(
+        dataset: torch.utils.data.Dataset,
+        batch_size: int,
+        is_training: bool = False,
+        dev_env: Optional[DeviceEnv] = None,
+        normalize=True,
+        pp_cfg: PreprocessCfg = PreprocessCfg(),
+        mix_cfg: MixupCfg = None,
+        num_workers: int = 1,
+        collate_fn: Optional[Callable] = None,
+        pin_memory: bool = False,
+        use_multi_epochs_loader: bool = False,
+        persistent_workers: bool = True,
 ):
-    re_num_splits = 0
-    if re_split:
-        # apply RE to second half of batch if no aug split otherwise line up with aug split
-        re_num_splits = num_aug_splits or 2
-    dataset.transform = create_transform(
-        input_size,
-        is_training=is_training,
-        use_fetcher=True,
-        no_aug=no_aug,
-        scale=scale,
-        ratio=ratio,
-        hflip=hflip,
-        vflip=vflip,
-        color_jitter=color_jitter,
-        auto_augment=auto_augment,
-        interpolation=interpolation,
-        mean=mean,
-        std=std,
-        crop_pct=crop_pct,
-        tf_preprocessing=tf_preprocessing,
-        re_prob=re_prob,
-        re_mode=re_mode,
-        re_count=re_count,
-        re_num_splits=re_num_splits,
-        separate=num_aug_splits > 0,
-    )
+    """ Create a data loader with pre-processing driven by config dataclasses.
+
+    Args:
+        dataset: dataset to load samples from
+        batch_size: number of samples per batch
+        is_training: enable training-time loader behaviour (shuffling, train sampler)
+        dev_env: device environment, resolved via DeviceEnv.instance() when None
+        normalize: normalize in the fetcher/prefetcher (False if the transform already normalizes)
+        pp_cfg: pre-processing configuration
+        mix_cfg: mixup/cutmix configuration, builds a FastCollateMixup collate fn when set
+        collate_fn: override the default collate fn
+        num_workers: number of loader worker processes
+        pin_memory: pin host memory for loader batches
+        use_multi_epochs_loader: use the MultiEpochsDataLoader wrapper
+        persistent_workers: keep loader workers alive between epochs
+
+    Returns:
+        The DataLoader wrapped in a device fetcher (Fetcher or PrefetcherCuda).
+    """
 
     if dev_env is None:
         dev_env = DeviceEnv.instance()
@@ -85,10 +63,24 @@ def create_loader(
         else:
             # This will add extra duplicate entries to result in equal num
             # of samples per-process, will slightly alter validation results
-            sampler = OrderedDistributedSampler(dataset, num_replicas=dev_env.world_size, rank=dev_env.global_rank)
+            sampler = OrderedDistributedSampler(
+                dataset, num_replicas=dev_env.world_size, rank=dev_env.global_rank)
 
     if collate_fn is None:
-        collate_fn = fast_collate
+        if mix_cfg is not None and mix_cfg.prob > 0:
+            collate_fn = FastCollateMixup(
+                mixup_alpha=mix_cfg.mixup_alpha,
+                cutmix_alpha=mix_cfg.cutmix_alpha,
+                cutmix_minmax=mix_cfg.cutmix_minmax,
+                prob=mix_cfg.prob,
+                switch_prob=mix_cfg.switch_prob,
+                mode=mix_cfg.mode,
+                correct_lam=mix_cfg.correct_lam,
+                label_smoothing=mix_cfg.label_smoothing,
+                num_classes=mix_cfg.num_classes,
+            )
+        else:
+            collate_fn = fast_collate
 
     loader_class = torch.utils.data.DataLoader
     if use_multi_epochs_loader:
@@ -110,13 +102,18 @@ def create_loader(
     loader = loader_class(dataset, **loader_args)
 
     fetcher_kwargs = dict(
-        mean=mean,
-        std=std,
-        re_prob=re_prob if is_training and not no_aug else 0.,
-        re_mode=re_mode,
-        re_count=re_count,
-        re_num_splits=re_num_splits
+        normalize=normalize,
+        mean=pp_cfg.mean,
+        std=pp_cfg.std,
     )
+    if normalize and is_training and pp_cfg.aug is not None:
+        fetcher_kwargs.update(dict(
+            re_prob=pp_cfg.aug.re_prob,
+            re_mode=pp_cfg.aug.re_mode,
+            re_count=pp_cfg.aug.re_count,
+            num_aug_splits=pp_cfg.aug.num_aug_splits,
+        ))
+
     if dev_env.type_cuda:
         loader = PrefetcherCuda(loader, **fetcher_kwargs)
     else:
diff --git a/timm/data/mixup.py b/timm/data/mixup.py
index 38477548..b618bb7c 100644
--- a/timm/data/mixup.py
+++ b/timm/data/mixup.py
@@ -102,7 +102,7 @@ class Mixup:
         num_classes (int): number of classes for target
     """
     def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5,
-                 mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000):
+                 mode='batch', correct_lam=True, label_smoothing=0., num_classes=0):
         self.mixup_alpha = mixup_alpha
         self.cutmix_alpha = cutmix_alpha
         self.cutmix_minmax = cutmix_minmax
@@ -113,6 +113,8 @@ class Mixup:
         self.mix_prob = prob
         self.switch_prob = switch_prob
         self.label_smoothing = label_smoothing
+        if label_smoothing > 0.:
+            assert num_classes > 0
         self.num_classes = num_classes
         self.mode = mode
         self.correct_lam = correct_lam  # correct lambda based on clipped area for cutmix
@@ -218,17 +220,30 @@ class Mixup:
         return x, target
 
 
+def blend(a, b, lam, is_tensor=False, round_output=True):
+    if is_tensor:
+        mixed = a.to(dtype=torch.float32) * lam + b.to(dtype=torch.float32) * (1 - lam)
+        if round_output:
+            torch.round(mixed, out=mixed)
+    else:
+        mixed = a.astype(np.float32) * lam + b.astype(np.float32) * (1 - lam)
+        if round_output:
+            np.rint(mixed, out=mixed)
+    return mixed
+
+
 class FastCollateMixup(Mixup):
     """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch
 
     A Mixup impl that's performed while collating the batches.
     """
 
-    def _mix_elem_collate(self, output, batch, half=False):
+    def _mix_elem_collate(self, output, batch, half=False, is_tensor=False):
         batch_size = len(batch)
         num_elem = batch_size // 2 if half else batch_size
         assert len(output) == num_elem
         lam_batch, use_cutmix = self._params_per_elem(num_elem)
+        round_output = output.dtype == torch.uint8
         for i in range(num_elem):
             j = batch_size - i - 1
             lam = lam_batch[i]
@@ -236,22 +251,23 @@ class FastCollateMixup(Mixup):
             if lam != 1.:
                 if use_cutmix[i]:
                     if not half:
-                        mixed = mixed.copy()
+                        mixed = mixed.clone() if is_tensor else mixed.copy()  # don't want to modify while iterating
                     (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                         output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
                     mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                     lam_batch[i] = lam
                 else:
-                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    np.rint(mixed, out=mixed)
-            output[i] += torch.from_numpy(mixed.astype(np.uint8))
+                    mixed = blend(mixed, batch[j][0], lam, is_tensor, round_output)
+            mixed = mixed.to(dtype=output.dtype) if is_tensor else torch.from_numpy(mixed.astype(np.uint8))
+            output[i].copy_(mixed)
         if half:
             lam_batch = np.concatenate((lam_batch, np.ones(num_elem)))
         return torch.tensor(lam_batch).unsqueeze(1)
 
-    def _mix_pair_collate(self, output, batch):
+    def _mix_pair_collate(self, output, batch, is_tensor=False):
         batch_size = len(batch)
         lam_batch, use_cutmix = self._params_per_elem(batch_size // 2)
+        round_output = output.dtype == torch.uint8
         for i in range(batch_size // 2):
             j = batch_size - i - 1
             lam = lam_batch[i]
@@ -262,24 +278,30 @@ class FastCollateMixup(Mixup):
             if use_cutmix[i]:
                 (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                     output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
-                patch_i = mixed_i[:, yl:yh, xl:xh].copy()
+                patch_i = mixed_i[:, yl:yh, xl:xh]
+                patch_i = patch_i.clone() if is_tensor else patch_i.copy()  # don't want to modify while iterating
                 mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh]
                 mixed_j[:, yl:yh, xl:xh] = patch_i
                 lam_batch[i] = lam
             else:
-                mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam)
-                mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam)
+                mixed_temp = blend(mixed_i, mixed_j, lam, is_tensor, round_output)
+                mixed_j = blend(mixed_j, mixed_i, lam, is_tensor, round_output)
                 mixed_i = mixed_temp
-                np.rint(mixed_j, out=mixed_j)
-                np.rint(mixed_i, out=mixed_i)
-            output[i] += torch.from_numpy(mixed_i.astype(np.uint8))
-            output[j] += torch.from_numpy(mixed_j.astype(np.uint8))
+            if is_tensor:
+                mixed_i = mixed_i.to(dtype=output.dtype)
+                mixed_j = mixed_j.to(dtype=output.dtype)
+            else:
+                mixed_i = torch.from_numpy(mixed_i.astype(np.uint8))
+                mixed_j = torch.from_numpy(mixed_j.astype(np.uint8))
+            output[i].copy_(mixed_i)
+            output[j].copy_(mixed_j)
         lam_batch = np.concatenate((lam_batch, lam_batch[::-1]))
         return torch.tensor(lam_batch).unsqueeze(1)
 
-    def _mix_batch_collate(self, output, batch):
+    def _mix_batch_collate(self, output, batch, is_tensor=False):
         batch_size = len(batch)
         lam, use_cutmix = self._params_per_batch()
+        round_output = output.dtype == torch.uint8
         if use_cutmix:
             (yl, yh, xl, xh), lam = cutmix_bbox_and_lam(
                 output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam)
@@ -288,12 +310,12 @@ class FastCollateMixup(Mixup):
             mixed = batch[i][0]
             if lam != 1.:
                 if use_cutmix:
-                    mixed = mixed.copy()  # don't want to modify the original while iterating
+                    mixed = mixed.clone() if is_tensor else mixed.copy()  # don't want to modify while iterating
                     mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh]
                 else:
-                    mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam)
-                    np.rint(mixed, out=mixed)
-            output[i] += torch.from_numpy(mixed.astype(np.uint8))
+                    mixed = blend(mixed, batch[j][0], lam, is_tensor, round_output)
+            mixed = mixed.to(dtype=output.dtype) if is_tensor else torch.from_numpy(mixed.astype(np.uint8))
+            output[i].copy_(mixed)
         return lam
 
     def __call__(self, batch, _=None):
@@ -302,13 +324,15 @@ class FastCollateMixup(Mixup):
         half = 'half' in self.mode
         if half:
             batch_size //= 2
-        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8)
+        is_tensor = isinstance(batch[0][0], torch.Tensor)
+        output_dtype = batch[0][0].dtype if is_tensor else torch.uint8  # always uint8 if numpy src
+        output = torch.zeros((batch_size, *batch[0][0].shape), dtype=output_dtype)
         if self.mode == 'elem' or self.mode == 'half':
-            lam = self._mix_elem_collate(output, batch, half=half)
+            lam = self._mix_elem_collate(output, batch, half=half, is_tensor=is_tensor)
         elif self.mode == 'pair':
-            lam = self._mix_pair_collate(output, batch)
+            lam = self._mix_pair_collate(output, batch, is_tensor=is_tensor)
         else:
-            lam = self._mix_batch_collate(output, batch)
+            lam = self._mix_batch_collate(output, batch, is_tensor=is_tensor)
         target = torch.tensor([b[1] for b in batch], dtype=torch.int64)
         target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu')
         target = target[:batch_size]
diff --git a/timm/data/prefetcher_cuda.py b/timm/data/prefetcher_cuda.py
index 4f1c4e10..9432df59 100644
--- a/timm/data/prefetcher_cuda.py
+++ b/timm/data/prefetcher_cuda.py
@@ -7,25 +7,34 @@ from .random_erasing import RandomErasing
 
 
 class PrefetcherCuda:
 
-    def __init__(self,
-                 loader,
-                 mean=IMAGENET_DEFAULT_MEAN,
-                 std=IMAGENET_DEFAULT_STD,
-                 fp16=False,
-                 re_prob=0.,
-                 re_mode='const',
-                 re_count=1,
-                 re_num_splits=0):
+    def __init__(
+            self,
+            loader,
+            device: torch.device = torch.device('cuda'),
+            dtype=torch.float32,
+            normalize=True,
+            normalize_shape=(1, 3, 1, 1),
+            mean=IMAGENET_DEFAULT_MEAN,
+            std=IMAGENET_DEFAULT_STD,
+            num_aug_splits=0,
+            re_prob=0.,
+            re_mode='const',
+            re_count=1
+    ):
         self.loader = loader
-        self.mean = torch.tensor([x * 255 for x in mean]).cuda().view(1, 3, 1, 1)
-        self.std = torch.tensor([x * 255 for x in std]).cuda().view(1, 3, 1, 1)
-        self.fp16 = fp16
-        if fp16:
-            self.mean = self.mean.half()
-            self.std = self.std.half()
+        self.device = device
+        self.dtype = dtype
+        if normalize:
+            self.mean = torch.tensor(
+                [x * 255 for x in mean], dtype=self.dtype, device=self.device).view(normalize_shape)
+            self.std = torch.tensor(
+                [x * 255 for x in std], dtype=self.dtype, device=self.device).view(normalize_shape)
+        else:
+            self.mean = None
+            self.std = None
         if re_prob > 0.:
             self.random_erasing = RandomErasing(
-                probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits)
+                probability=re_prob, mode=re_mode, count=re_count, num_splits=num_aug_splits)
         else:
             self.random_erasing = None
 
@@ -35,12 +44,11 @@ class PrefetcherCuda:
 
         for next_input, next_target in self.loader:
             with torch.cuda.stream(stream):
-                next_input = next_input.cuda(non_blocking=True)
-                next_target = next_target.cuda(non_blocking=True)
-                if self.fp16:
-                    next_input = next_input.half().sub_(self.mean).div_(self.std)
-                else:
-                    next_input = next_input.float().sub_(self.mean).div_(self.std)
+                next_input = next_input.to(device=self.device, non_blocking=True)
+                next_input = next_input.to(dtype=self.dtype)
+                if self.mean is not None:
+                    next_input.sub_(self.mean).div_(self.std)
+                next_target = next_target.to(device=self.device, non_blocking=True)
                 if self.random_erasing is not None:
                     next_input = self.random_erasing(next_input)
 
@@ -76,4 +84,4 @@ class PrefetcherCuda:
     @mixup_enabled.setter
     def mixup_enabled(self, x):
         if isinstance(self.loader.collate_fn, FastCollateMixup):
-            self.loader.collate_fn.mixup_enabled = x
\ No newline at end of file
+            self.loader.collate_fn.mixup_enabled = x
diff --git a/timm/data/random_erasing.py b/timm/data/random_erasing.py
index 78967d10..65d085a9 100644
--- a/timm/data/random_erasing.py
+++ b/timm/data/random_erasing.py
@@ -38,21 +38,20 @@ class RandomErasing:
             'const' - erase block is constant color of 0 for all channels
             'rand'  - erase block is same per-channel random (normal) color
             'pixel' - erase block is per-pixel random (normal) color
-        max_count: maximum number of erasing blocks per image, area per box is scaled by count.
+        count: maximum number of erasing blocks per image, area per box is scaled by count.
             per-image count is randomly chosen between 1 and this value.
""" def __init__( self, probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, - mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'): + mode='const', count=1, num_splits=0): self.probability = probability self.min_area = min_area self.max_area = max_area max_aspect = max_aspect or 1 / min_aspect self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) - self.min_count = min_count - self.max_count = max_count or min_count + self.count = count self.num_splits = num_splits mode = mode.lower() self.rand_color = False @@ -63,14 +62,13 @@ class RandomErasing: self.per_pixel = True # per pixel random normal else: assert not mode or mode == 'const' - self.device = device def _erase(self, img, chan, img_h, img_w, dtype): + device = img.device if random.random() > self.probability: return area = img_h * img_w - count = self.min_count if self.min_count == self.max_count else \ - random.randint(self.min_count, self.max_count) + count = random.randint(1, self.count) if self.count > 1 else self.count for _ in range(count): for attempt in range(10): target_area = random.uniform(self.min_area, self.max_area) * area / count @@ -81,17 +79,76 @@ class RandomErasing: top = random.randint(0, img_h - h) left = random.randint(0, img_w - w) img[:, top:top + h, left:left + w] = _get_pixels( - self.per_pixel, self.rand_color, (chan, h, w), - dtype=dtype, device=self.device) + self.per_pixel, self.rand_color, (chan, h, w), dtype=dtype, device=device) break - def __call__(self, input): - if len(input.size()) == 3: - self._erase(input, *input.size(), input.dtype) + def __call__(self, x): + if len(x.size()) == 3: + self._erase(x, *x.shape, x.dtype) else: - batch_size, chan, img_h, img_w = input.size() + batch_size, chan, img_h, img_w = x.shape # skip first slice of batch if num_splits is set (for clean portion of samples) batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 for i in range(batch_start, batch_size): - self._erase(input[i], chan, img_h, img_w, input.dtype) - return input + self._erase(x[i], chan, img_h, img_w, x.dtype) + return x + + +class RandomErasingMasked: + """ Randomly selects a rectangle region in an image and erases its pixels. + 'Random Erasing Data Augmentation' by Zhong et al. + See https://arxiv.org/pdf/1708.04896.pdf + + This variant of RandomErasing is intended to be applied to either a batch + or single image tensor after it has been normalized by dataset mean and std. + Args: + probability: Probability that the Random Erasing operation will be performed for each box (count) + min_area: Minimum percentage of erased area wrt input image area. + max_area: Maximum percentage of erased area wrt input image area. + min_aspect: Minimum aspect ratio of erased area. + count: maximum number of erasing blocks per image, area per box is scaled by count. + per-image count is between 0 and this value. + """ + + def __init__( + self, + probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', count=1, num_splits=0): + self.probability = probability + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.mode = mode # FIXME currently ignored, add back options besides normal mean=0, std=1 noise? 
+        self.count = count
+        self.num_splits = num_splits
+
+    @torch.no_grad()
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        device = x.device
+        batch_size, _, img_h, img_w = x.shape
+        batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0
+
+        # NOTE simplified from v1 with one count value and same prob applied for all
+        enable = (torch.empty((batch_size, self.count), device=device).uniform_() < self.probability).float()
+        enable = enable / enable.sum(dim=1, keepdim=True).clamp(min=1)
+        target_area = torch.empty(
+            (batch_size, self.count), device=device).uniform_(self.min_area, self.max_area) * enable
+        aspect_ratio = torch.empty((batch_size, self.count), device=device).uniform_(*self.log_aspect_ratio).exp()
+        h_coord = torch.arange(0, img_h, device=device).unsqueeze(-1).expand(-1, self.count).float()
+        w_coord = torch.arange(0, img_w, device=device).unsqueeze(-1).expand(-1, self.count).float()
+        h_mid = torch.rand((batch_size, self.count), device=device) * img_h
+        w_mid = torch.rand((batch_size, self.count), device=device) * img_w
+        noise = torch.empty_like(x[0]).normal_()
+
+        for i in range(batch_start, batch_size):
+            h_half = (img_h / 2) * torch.sqrt(target_area[i] * aspect_ratio[i])  # 1/2 box h
+            h_mask = (h_coord > (h_mid[i] - h_half)) & (h_coord < (h_mid[i] + h_half))
+            w_half = (img_w / 2) * torch.sqrt(target_area[i] / aspect_ratio[i])  # 1/2 box w
+            w_mask = (w_coord > (w_mid[i] - w_half)) & (w_coord < (w_mid[i] + w_half))
+            #mask = (h_mask.unsqueeze(1) & w_mask.unsqueeze(0)).any(dim=-1)
+            #x[i].copy_(torch.where(mask, noise, x[i]))
+            mask = ~(h_mask.unsqueeze(1) & w_mask.unsqueeze(0)).any(dim=-1)
+            x[i] = x[i].where(mask, noise)
+            #x[i].masked_scatter_(mask, noise)
+        return x
diff --git a/timm/data/transforms.py b/timm/data/transforms.py
index 4220304f..03f0e825 100644
--- a/timm/data/transforms.py
+++ b/timm/data/transforms.py
@@ -1,5 +1,7 @@
 import torch
 import torchvision.transforms.functional as F
+from torchvision.transforms import InterpolationMode
+
 from PIL import Image
 import warnings
 import math
@@ -30,29 +32,40 @@ class ToTensor:
         return torch.from_numpy(np_img).to(dtype=self.dtype)
 
 
-_pil_interpolation_to_str = {
-    Image.NEAREST: 'PIL.Image.NEAREST',
-    Image.BILINEAR: 'PIL.Image.BILINEAR',
-    Image.BICUBIC: 'PIL.Image.BICUBIC',
-    Image.LANCZOS: 'PIL.Image.LANCZOS',
-    Image.HAMMING: 'PIL.Image.HAMMING',
-    Image.BOX: 'PIL.Image.BOX',
-}
+class ToTensorNormalize:
 
+    def __init__(self, mean, std, dtype=torch.float32, device=torch.device('cpu')):
+        self.dtype = dtype
+        mean = torch.as_tensor(mean, dtype=dtype, device=device)
+        std = torch.as_tensor(std, dtype=dtype, device=device)
+        if (std == 0).any():
+            raise ValueError('std evaluated to zero after conversion to {}, leading to division by zero.'.format(dtype))
+        if mean.ndim == 1:
+            mean = mean.view(-1, 1, 1)
+        if std.ndim == 1:
+            std = std.view(-1, 1, 1)
+        self.mean = mean
+        self.std = std
 
-def _pil_interp(method):
-    if method == 'bicubic':
-        return Image.BICUBIC
-    elif method == 'lanczos':
-        return Image.LANCZOS
-    elif method == 'hamming':
-        return Image.HAMMING
-    else:
-        # default bilinear, do we want to allow nearest?
-        return Image.BILINEAR
+    def __call__(self, pil_img):
+        mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
+        img = torch.from_numpy(
+            np.array(pil_img, mode_to_nptype.get(pil_img.mode, np.uint8))
+        )
+        if pil_img.mode == '1':
+            img = 255 * img
+        img = img.view(pil_img.size[1], pil_img.size[0], len(pil_img.getbands()))
+        img = img.permute((2, 0, 1))
+        if isinstance(img, torch.ByteTensor):
+            img = img.to(self.dtype)
+            img.sub_(self.mean * 255.).div_(self.std * 255.)
+        else:
+            img = img.to(self.dtype)
+            img.sub_(self.mean).div_(self.std)
+        return img
 
-_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC)
+_RANDOM_INTERPOLATION = (InterpolationMode.BILINEAR, InterpolationMode.BICUBIC)
 
 
 class RandomResizedCropAndInterpolation:
@@ -82,7 +95,7 @@ class RandomResizedCropAndInterpolation:
         if interpolation == 'random':
             self.interpolation = _RANDOM_INTERPOLATION
         else:
-            self.interpolation = _pil_interp(interpolation)
+            self.interpolation = InterpolationMode(interpolation)
         self.scale = scale
         self.ratio = ratio
 
@@ -146,9 +159,9 @@ class RandomResizedCropAndInterpolation:
 
     def __repr__(self):
         if isinstance(self.interpolation, (tuple, list)):
-            interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation])
+            interpolate_str = ' '.join([x.value for x in self.interpolation])
         else:
-            interpolate_str = _pil_interpolation_to_str[self.interpolation]
+            interpolate_str = self.interpolation.value
         format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
         format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
         format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
diff --git a/timm/data/transforms_factory.py b/timm/data/transforms_factory.py
index 16e08a39..1c8d15e2 100644
--- a/timm/data/transforms_factory.py
+++ b/timm/data/transforms_factory.py
@@ -4,59 +4,50 @@ Factory methods for building image transforms for use with TIMM (PyTorch Image M
 
 Hacked together by / Copyright 2020 Ross Wightman
 """
 import math
+from typing import Union, Tuple
 
 import torch
 from torchvision import transforms
 
-from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
 from timm.data.auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform
-from timm.data.transforms import _pil_interp, RandomResizedCropAndInterpolation, ToNumpy, ToTensor
+from timm.data.config import PreprocessCfg, AugCfg
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
 from timm.data.random_erasing import RandomErasing
+from timm.data.transforms import RandomResizedCropAndInterpolation, ToNumpy, ToTensorNormalize
 
 
 def transforms_noaug_train(
-        img_size=224,
+        img_size: Union[int, Tuple[int]] = 224,
         interpolation='bilinear',
-        use_prefetcher=False,
         mean=IMAGENET_DEFAULT_MEAN,
         std=IMAGENET_DEFAULT_STD,
+        normalize=False,
 ):
     if interpolation == 'random':
         # random interpolation not supported with no-aug
         interpolation = 'bilinear'
     tfl = [
-        transforms.Resize(img_size, _pil_interp(interpolation)),
+        transforms.Resize(img_size, transforms.InterpolationMode(interpolation)),
         transforms.CenterCrop(img_size)
     ]
-    if use_prefetcher:
-        # prefetcher and collate will handle tensor conversion and norm
-        tfl += [ToNumpy()]
-    else:
+    if normalize:
         tfl += [
             transforms.ToTensor(),
-            transforms.Normalize(
-                mean=torch.tensor(mean),
-                std=torch.tensor(std))
+            transforms.Normalize(mean=torch.tensor(mean), std=torch.tensor(std))
         ]
+    else:
+        # (pre)fetcher and collate will handle tensor conversion and normalize
+        tfl += [ToNumpy()]
     return transforms.Compose(tfl)
 
 
 def transforms_imagenet_train(
-        img_size=224,
-        scale=None,
-        ratio=None,
-        hflip=0.5,
-        vflip=0.,
-        color_jitter=0.4,
-        auto_augment=None,
+        img_size: Union[int, Tuple[int]] = 224,
         interpolation='random',
-        use_prefetcher=False,
         mean=IMAGENET_DEFAULT_MEAN,
         std=IMAGENET_DEFAULT_STD,
-        re_prob=0.,
-        re_mode='const',
-        re_count=1,
-        re_num_splits=0,
+        aug_cfg=AugCfg(),
+        normalize=False,
         separate=False,
 ):
     """
@@ -66,18 +57,24 @@ def transforms_imagenet_train(
      * a portion of the data through the secondary transform
      * normalizes and converts the branches above with the third, final transform
     """
-    scale = tuple(scale or (0.08, 1.0))  # default imagenet scale range
-    ratio = tuple(ratio or (3./4., 4./3.))  # default imagenet ratio range
+    scale_range = tuple(aug_cfg.scale_range or (0.08, 1.0))  # default imagenet scale range
+    ratio_range = tuple(aug_cfg.ratio_range or (3. / 4., 4. / 3.))  # default imagenet ratio range
+
+    # 'primary' train transforms include random resize + crop w/ optional horizontal and vertical flipping aug.
+    # This is the core of standard ImageNet ResNet and Inception pre-processing
     primary_tfl = [
-        RandomResizedCropAndInterpolation(img_size, scale=scale, ratio=ratio, interpolation=interpolation)]
-    if hflip > 0.:
-        primary_tfl += [transforms.RandomHorizontalFlip(p=hflip)]
-    if vflip > 0.:
-        primary_tfl += [transforms.RandomVerticalFlip(p=vflip)]
+        RandomResizedCropAndInterpolation(img_size, scale=scale_range, ratio=ratio_range, interpolation=interpolation)]
+    if aug_cfg.hflip_prob > 0.:
+        primary_tfl += [transforms.RandomHorizontalFlip(p=aug_cfg.hflip_prob)]
+    if aug_cfg.vflip_prob > 0.:
+        primary_tfl += [transforms.RandomVerticalFlip(p=aug_cfg.vflip_prob)]
 
+    # 'secondary' transform stage includes either color jitter (could add lighting too) or auto-augmentations
+    # such as AutoAugment, RandAugment, AugMix, etc
     secondary_tfl = []
-    if auto_augment:
-        assert isinstance(auto_augment, str)
+    if aug_cfg.auto_augment:
+        aa = aug_cfg.auto_augment
+        assert isinstance(aa, str)
         if isinstance(img_size, (tuple, list)):
             img_size_min = min(img_size)
         else:
@@ -87,58 +84,63 @@ def transforms_imagenet_train(
             img_mean=tuple([min(255, round(255 * x)) for x in mean]),
         )
         if interpolation and interpolation != 'random':
-            aa_params['interpolation'] = _pil_interp(interpolation)
-        if auto_augment.startswith('rand'):
-            secondary_tfl += [rand_augment_transform(auto_augment, aa_params)]
-        elif auto_augment.startswith('augmix'):
+            aa_params['interpolation'] = interpolation
+        if aa.startswith('rand'):
+            secondary_tfl += [rand_augment_transform(aa, aa_params)]
+        elif aa.startswith('augmix'):
             aa_params['translate_pct'] = 0.3
-            secondary_tfl += [augment_and_mix_transform(auto_augment, aa_params)]
+            secondary_tfl += [augment_and_mix_transform(aa, aa_params)]
         else:
-            secondary_tfl += [auto_augment_transform(auto_augment, aa_params)]
-    elif color_jitter is not None:
+            secondary_tfl += [auto_augment_transform(aa, aa_params)]
+    elif aug_cfg.color_jitter is not None:
         # color jitter is enabled when not using AA
-        if isinstance(color_jitter, (list, tuple)):
+        cj = aug_cfg.color_jitter
+        if isinstance(cj, (list, tuple)):
             # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation
             # or 4 if also augmenting hue
-            assert len(color_jitter) in (3, 4)
+            assert len(cj) in (3, 4)
         else:
             # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue
-            color_jitter = (float(color_jitter),) * 3
-        secondary_tfl += [transforms.ColorJitter(*color_jitter)]
+            cj = (float(cj),) * 3
+        secondary_tfl += [transforms.ColorJitter(*cj)]
 
+    # 'final' transform stage includes normalization, followed by optional random erasing and tensor conversion
     final_tfl = []
-    if use_prefetcher:
-        # prefetcher and collate will handle tensor conversion and norm
-        final_tfl += [ToNumpy()]
-    else:
+    if normalize:
         final_tfl += [
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=torch.tensor(mean),
-                std=torch.tensor(std))
+            ToTensorNormalize(mean=mean, std=std)
         ]
-        if re_prob > 0.:
-            final_tfl.append(
-                RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu'))
+        if aug_cfg.re_prob > 0.:
+            final_tfl.append(RandomErasing(
+                aug_cfg.re_prob,
+                mode=aug_cfg.re_mode,
+                count=aug_cfg.re_count,
+                num_splits=aug_cfg.num_aug_splits))
+    else:
+        # when normalize disabled, (pre)fetcher and collate will handle tensor conversion and normalize
+        final_tfl += [ToNumpy()]
 
     if separate:
+        # return each transform stage separately
         return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl)
     else:
        return transforms.Compose(primary_tfl + secondary_tfl + final_tfl)
 
 
 def transforms_imagenet_eval(
-        img_size=224,
+        img_size: Union[int, Tuple[int]] = 224,
         crop_pct=None,
         interpolation='bilinear',
-        use_prefetcher=False,
         mean=IMAGENET_DEFAULT_MEAN,
-        std=IMAGENET_DEFAULT_STD):
+        std=IMAGENET_DEFAULT_STD,
+        normalize=False,
+):
     crop_pct = crop_pct or DEFAULT_CROP_PCT
 
     if isinstance(img_size, (tuple, list)):
         assert len(img_size) == 2
         if img_size[-1] == img_size[-2]:
+            # FIXME handle case where img is square and we want non aspect preserving resize
             # fall-back to older behaviour so Resize scales to shortest edge if target is square
             scale_size = int(math.floor(img_size[0] / crop_pct))
         else:
@@ -147,27 +149,87 @@ def transforms_imagenet_eval(
         scale_size = int(math.floor(img_size / crop_pct))
 
     tfl = [
-        transforms.Resize(scale_size, _pil_interp(interpolation)),
+        transforms.Resize(scale_size, transforms.InterpolationMode(interpolation)),
         transforms.CenterCrop(img_size),
     ]
-    if use_prefetcher:
-        # prefetcher and collate will handle tensor conversion and norm
-        tfl += [ToNumpy()]
-    else:
+    if normalize:
         tfl += [
-            transforms.ToTensor(),
-            transforms.Normalize(
-                mean=torch.tensor(mean),
-                std=torch.tensor(std))
+            ToTensorNormalize(mean=mean, std=std)
         ]
+    else:
+        # (pre)fetcher and collate will handle tensor conversion and normalize
+        tfl += [ToNumpy()]
 
     return transforms.Compose(tfl)
 
 
+def create_transform_v2(
+        cfg=PreprocessCfg(),
+        is_training=False,
+        normalize=False,
+        separate=False,
+        tf_preprocessing=False,
+):
+    """ Create a pre-processing transform from a PreprocessCfg instance.
+
+    Args:
+        cfg: Pre-processing configuration
+        is_training (bool): Create transform for training pre-processing
+        tf_preprocessing (bool): Use Tensorflow pre-processing (for validation)
+        normalize (bool): Enable normalization in transforms (otherwise handled by fetcher/pre-fetcher)
+        separate (bool): Return transforms separated into stages (for train)
+
+    Returns:
+        A transforms.Compose pipeline, or a tuple of per-stage pipelines when separate is set.
+    """
+    input_size = cfg.input_size
+    if isinstance(input_size, (tuple, list)):
+        img_size = input_size[-2:]
+    else:
+        img_size = input_size
+
+    if tf_preprocessing:
+        assert not normalize, "Expecting normalization to be handled in (pre)fetcher w/ TF preprocessing"
+        assert not separate, "Separate transforms not supported for TF preprocessing"
+        from timm.data.tf_preprocessing import TfPreprocessTransform
+        transform = TfPreprocessTransform(
+            is_training=is_training, size=img_size, interpolation=cfg.interpolation)
+    else:
+        if is_training and cfg.aug is None:
+            assert not separate, "Cannot perform split augmentation with no_aug"
+            transform = transforms_noaug_train(
+                img_size,
+                interpolation=cfg.interpolation,
+                normalize=normalize,
+                mean=cfg.mean,
+                std=cfg.std)
+        elif is_training:
+            transform = transforms_imagenet_train(
+                img_size,
+                interpolation=cfg.interpolation,
+                mean=cfg.mean,
+                std=cfg.std,
+                aug_cfg=cfg.aug,
+                normalize=normalize,
+                separate=separate)
+        else:
+            assert not separate, "Separate transforms not supported for validation preprocessing"
+            transform = transforms_imagenet_eval(
+                img_size,
+                interpolation=cfg.interpolation,
+                crop_pct=cfg.crop_pct,
+                mean=cfg.mean,
+                std=cfg.std,
+                normalize=normalize,
+            )
+
+    return transform
+
+
 def create_transform(
         input_size,
         is_training=False,
-        use_fetcher=False,
+        use_prefetcher=False,
         no_aug=False,
         scale=None,
         ratio=None,
@@ -191,7 +253,8 @@ def create_transform(
     else:
         img_size = input_size
 
-    if tf_preprocessing and use_fetcher:
+    normalize_in_transform = not use_prefetcher
+    if tf_preprocessing and use_prefetcher:
         assert not separate, "Separate transforms not supported for TF preprocessing"
         from timm.data.tf_preprocessing import TfPreprocessTransform
         transform = TfPreprocessTransform(
@@ -202,35 +265,41 @@ def create_transform(
             transform = transforms_noaug_train(
                 img_size,
                 interpolation=interpolation,
-                use_prefetcher=use_fetcher,
                 mean=mean,
-                std=std)
+                std=std,
+                normalize=normalize_in_transform,
+            )
         elif is_training:
-            transform = transforms_imagenet_train(
-                img_size,
-                scale=scale,
-                ratio=ratio,
-                hflip=hflip,
-                vflip=vflip,
+            aug_cfg = AugCfg(
+                scale_range=scale,
+                ratio_range=ratio,
+                hflip_prob=hflip,
+                vflip_prob=vflip,
                 color_jitter=color_jitter,
                 auto_augment=auto_augment,
-                interpolation=interpolation,
-                use_prefetcher=use_fetcher,
-                mean=mean,
-                std=std,
                 re_prob=re_prob,
                 re_mode=re_mode,
                 re_count=re_count,
-                re_num_splits=re_num_splits,
-                separate=separate)
+                num_aug_splits=re_num_splits,
+            )
+            transform = transforms_imagenet_train(
+                img_size,
+                interpolation=interpolation,
+                mean=mean,
+                std=std,
+                aug_cfg=aug_cfg,
+                normalize=normalize_in_transform,
+                separate=separate
+            )
         else:
-            assert not separate, "Separate transforms not supported for validation preprocessing"
+            assert not separate, "Separate transforms not supported for validation pre-processing"
             transform = transforms_imagenet_eval(
                 img_size,
                 interpolation=interpolation,
-                use_prefetcher=use_fetcher,
                 mean=mean,
                 std=std,
-                crop_pct=crop_pct)
+                crop_pct=crop_pct,
+                normalize=normalize_in_transform,
+            )
 
     return transform
diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index adfef550..39f44c87 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -24,13 +24,20 @@ _logger = logging.getLogger(__name__)
 def load_state_dict(checkpoint_path, use_ema=False):
     if checkpoint_path and os.path.isfile(checkpoint_path):
         checkpoint = torch.load(checkpoint_path, map_location='cpu')
-        state_dict_key = 'state_dict'
+        state_dict_key = ''
         if isinstance(checkpoint, dict):
             if use_ema and 'state_dict_ema' in checkpoint:
                 state_dict_key = 'state_dict_ema'
-        if state_dict_key and state_dict_key in checkpoint:
+            elif use_ema and 'model_ema' in checkpoint:
+                state_dict_key = 'model_ema'
+            elif 'state_dict' in checkpoint:
+                state_dict_key = 'state_dict'
+            elif 'model' in checkpoint:
+                state_dict_key = 'model'
+        if state_dict_key:
+            state_dict = checkpoint[state_dict_key]
             new_state_dict = OrderedDict()
-            for k, v in checkpoint[state_dict_key].items():
+            for k, v in state_dict.items():
                 # strip `module.` prefix
                 name = k[7:] if k.startswith('module') else k
                 new_state_dict[name] = v
diff --git a/train.py b/train.py
index cca814fd..1e95c831 100755
--- a/train.py
+++ b/train.py
@@ -30,7 +30,8 @@ import torchvision.utils
 
 from timm.bits import initialize_device, setup_model_and_optimizer, DeviceEnv, Monitor, Tracker,\
     TrainState, TrainServices, TrainCfg, CheckpointManager, AccuracyTopK, AvgTensor, distribute_bn
-from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
+from timm.data import create_dataset, create_transform_v2, create_loader_v2, resolve_data_config,\
+    PreprocessCfg, AugCfg, MixupCfg, AugMixDataset
 from timm.models import create_model, safe_model_name, convert_splitbn_model
 from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy
 from timm.optim import optimizer_kwargs
@@ -283,10 +284,11 @@ def main():
     else:
         _logger.info('Training with a single process on 1 device.')
 
-    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
-
     random_seed(args.seed, 0)  # Set all random seeds the same for model/state init (mandatory for XLA)
 
+    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
+    assert args.aug_splits == 0 or args.aug_splits > 1, 'A split of 1 makes no sense'
+
     train_state = setup_train_task(args, dev_env, mixup_active)
     train_cfg = train_state.train_cfg
 
@@ -421,11 +423,9 @@ def setup_train_task(args, dev_env: DeviceEnv, mixup_active: bool):
         _logger.info(
             f'Model {safe_model_name(args.model)} created, param count:{sum([m.numel() for m in model.parameters()])}')
 
-    # setup augmentation batch splits for contrastive loss or split bn
-    assert args.aug_splits == 0 or args.aug_splits > 1, 'A split of 1 makes no sense'
     # enable split bn (separate bn stats per batch-portion)
     if args.split_bn:
-        assert args.aug_splits > 1 or args.resplit
+        assert args.aug_splits > 1
         model = convert_splitbn_model(model, max(args.aug_splits, 2))
 
     train_state = setup_model_and_optimizer(
@@ -481,7 +481,7 @@ def setup_train_task(args, dev_env: DeviceEnv, mixup_active: bool):
     return train_state
 
 
-def setup_data(args, default_cfg, dev_env, mixup_active):
+def setup_data(args, default_cfg, dev_env: DeviceEnv, mixup_active: bool):
     data_config = resolve_data_config(vars(args), default_cfg=default_cfg, verbose=dev_env.primary)
 
     # create the train and eval datasets
@@ -489,18 +489,18 @@ def setup_data(args, default_cfg, dev_env, mixup_active):
         args.dataset,
         root=args.data_dir, split=args.train_split, is_training=True,
         batch_size=args.batch_size, repeats=args.epoch_repeats)
+
     dataset_eval = create_dataset(
-        args.dataset, root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size)
+        args.dataset,
+        root=args.data_dir, split=args.val_split, is_training=False, batch_size=args.batch_size)
 
     # setup mixup / cutmix
-    collate_fn = None
+    mixup_cfg = None
     if mixup_active:
-        mixup_args = dict(
-            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
+        mixup_cfg = MixupCfg(
             prob=args.mixup_prob, switch_prob=args.mixup_switch_prob, mode=args.mixup_mode,
+            mixup_alpha=args.mixup, cutmix_alpha=args.cutmix, cutmix_minmax=args.cutmix_minmax,
             label_smoothing=args.smoothing, num_classes=args.num_classes)
-        assert not args.aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
-        collate_fn = FastCollateMixup(**mixup_args)
 
     # wrap dataset in AugMix helper
     if args.aug_splits > 1:
@@ -510,46 +510,72 @@ def setup_data(args, default_cfg, dev_env: DeviceEnv, mixup_active: bool):
     train_interpolation = args.train_interpolation
     if args.no_aug or not train_interpolation:
         train_interpolation = data_config['interpolation']
-    loader_train = create_loader(
-        dataset_train,
+
+    if args.no_aug:
+        train_aug_cfg = None
+    else:
+        train_aug_cfg = AugCfg(
+            re_prob=args.reprob,
+            re_mode=args.remode,
+            re_count=args.recount,
+            ratio_range=args.ratio,
+            scale_range=args.scale,
+            hflip_prob=args.hflip,
+            vflip_prob=args.vflip,
+            color_jitter=args.color_jitter,
+            auto_augment=args.aa,
+            num_aug_splits=args.aug_splits,
+        )
+
+    train_pp_cfg = PreprocessCfg(
         input_size=data_config['input_size'],
-        batch_size=args.batch_size,
-        is_training=True,
-        no_aug=args.no_aug,
-        re_prob=args.reprob,
-        re_mode=args.remode,
-        re_count=args.recount,
-        re_split=args.resplit,
-        scale=args.scale,
-        ratio=args.ratio,
-        hflip=args.hflip,
-        vflip=args.vflip,
-        color_jitter=args.color_jitter,
-        auto_augment=args.aa,
-        num_aug_splits=args.aug_splits,
         interpolation=train_interpolation,
+        crop_pct=data_config['crop_pct'],
         mean=data_config['mean'],
         std=data_config['std'],
+        aug=train_aug_cfg,
+    )
+
+    # if using PyTorch XLA and RandomErasing is enabled, we must normalize and do RE in transforms on CPU
+    normalize_in_transform = dev_env.type_xla and args.reprob > 0
+
+    dataset_train.transform = create_transform_v2(
+        cfg=train_pp_cfg, is_training=True, normalize=normalize_in_transform)
+
+    loader_train = create_loader_v2(
+        dataset_train,
+        batch_size=args.batch_size,
+        is_training=True,
+        normalize=not normalize_in_transform,
+        pp_cfg=train_pp_cfg,
+        mix_cfg=mixup_cfg,
         num_workers=args.workers,
-        collate_fn=collate_fn,
         pin_memory=args.pin_mem,
         use_multi_epochs_loader=args.use_multi_epochs_loader
     )
 
+    eval_pp_cfg = PreprocessCfg(
+        input_size=data_config['input_size'],
+        interpolation=data_config['interpolation'],
+        crop_pct=data_config['crop_pct'],
+        mean=data_config['mean'],
+        std=data_config['std'],
+    )
+
+    dataset_eval.transform = create_transform_v2(
+        cfg=eval_pp_cfg, is_training=False, normalize=normalize_in_transform)
+
     eval_workers = args.workers
     if 'tfds' in args.dataset:
         # FIXME reduce validation issues when using TFDS w/ workers and distributed training
         eval_workers = min(2, args.workers)
-    loader_eval = create_loader(
+    loader_eval = create_loader_v2(
         dataset_eval,
-        input_size=data_config['input_size'],
         batch_size=args.validation_batch_size_multiplier * args.batch_size,
         is_training=False,
-        interpolation=data_config['interpolation'],
-        mean=data_config['mean'],
-        std=data_config['std'],
+        normalize=not normalize_in_transform,
+        pp_cfg=eval_pp_cfg,
         num_workers=eval_workers,
-        crop_pct=data_config['crop_pct'],
         pin_memory=args.pin_mem,
     )
 
     return data_config, loader_eval, loader_train
@@ -700,8 +726,12 @@ def evaluate(
             loss = loss_fn(output, target)
 
         # FIXME, explictly marking step for XLA use since I'm not using the parallel xm loader
-        # need to investigate whether parallel loader wrapper is helpful on tpu-vm or only usefor for 2-vm setup.
-        dev_env.mark_step()
+        # need to investigate whether parallel loader wrapper is helpful on tpu-vm or only useful for 2-vm setup.
+        if dev_env.type_xla:
+            dev_env.mark_step()
+        elif dev_env.type_cuda:
+            dev_env.synchronize()
+
         tracker.mark_iter_step_end()
         losses_m.update(loss, output.size(0))
         accuracy_m.update(output, target)
diff --git a/validate.py b/validate.py
index cee359c3..f4dc84e8 100755
--- a/validate.py
+++ b/validate.py
@@ -20,7 +20,8 @@ from collections import OrderedDict
 
 from timm.bits import initialize_device, Tracker, Monitor, AccuracyTopK, AvgTensor
 from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models
-from timm.data import create_dataset, create_loader, resolve_data_config, RealLabelsImagenet
+from timm.data import create_dataset, create_transform_v2, create_loader_v2, resolve_data_config, RealLabelsImagenet, \
+    PreprocessCfg
 from timm.utils import natural_key, setup_default_logging
 
@@ -141,18 +142,22 @@ def validate(args):
     else:
         real_labels = None
 
-    crop_pct = 1.0 if test_time_pool else data_config['crop_pct']
-    loader = create_loader(
-        dataset,
+    eval_pp_cfg = PreprocessCfg(
         input_size=data_config['input_size'],
-        batch_size=args.batch_size,
         interpolation=data_config['interpolation'],
+        crop_pct=1.0 if test_time_pool else data_config['crop_pct'],
         mean=data_config['mean'],
         std=data_config['std'],
+    )
+
+    dataset.transform = create_transform_v2(cfg=eval_pp_cfg, is_training=False)
+
+    loader = create_loader_v2(
+        dataset,
+        batch_size=args.batch_size,
+        pp_cfg=eval_pp_cfg,
         num_workers=args.workers,
-        crop_pct=crop_pct,
-        pin_memory=args.pin_mem,
-        tf_preprocessing=args.tf_preprocessing)
+        pin_memory=args.pin_mem)
 
     logger = Monitor(logger=_logger)
     tracker = Tracker()
@@ -175,16 +180,17 @@ def validate(args):
                 loss = criterion(output, target)
 
             if dev_env.type_cuda:
-                torch.cuda.synchronize()
+                dev_env.synchronize()
             tracker.mark_iter_step_end()
 
-            losses.update(loss.detach(), sample.size(0))
+            if dev_env.type_xla:
+                dev_env.mark_step()
+
             if real_labels is not None:
                 real_labels.add_result(output)
-            accuracy.update(output.detach(), target)
 
-            if dev_env.type_xla:
-                dev_env.mark_step()
+            losses.update(loss.detach(), sample.size(0))
+            accuracy.update(output.detach(), target)
 
             tracker.mark_iter()
             if step_idx % args.log_freq == 0:
@@ -212,7 +218,7 @@ def validate(args):
         top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
         param_count=round(param_count / 1e6, 2),
         img_size=data_config['input_size'][-1],
-        cropt_pct=crop_pct,
+        crop_pct=eval_pp_cfg.crop_pct,
         interpolation=data_config['interpolation'])
 
     logger.log_phase(phase='eval', name_map={'top1': 'Acc@1', 'top5': 'Acc@5'}, **results)
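
Usage sketch (illustrative, not part of the patch): the core of this change set replaces the kwarg-heavy create_loader/create_transform pair with the config-dataclass driven create_transform_v2/create_loader_v2, mirroring setup_data() in train.py above. The dataset name, root path, and hyper-parameter values below are placeholder assumptions, not values taken from the patch.

    from timm.data import (
        create_dataset, create_transform_v2, create_loader_v2,
        PreprocessCfg, AugCfg, MixupCfg)

    # training-time augmentation config (values are illustrative)
    aug_cfg = AugCfg(
        hflip_prob=0.5,
        color_jitter=0.4,
        auto_augment='rand-m9-mstd0.5',  # RandAugment policy string
        re_prob=0.25,  # random erasing, applied by the device fetcher
    )

    train_pp_cfg = PreprocessCfg(
        input_size=(3, 224, 224),
        interpolation='bicubic',
        crop_pct=0.875,
        aug=aug_cfg,
    )

    # hypothetical dataset name and root path
    dataset = create_dataset(
        'imagenet', root='/data/imagenet', split='train', is_training=True, batch_size=128)

    # CPU-side transform; with normalize=False the mean/std normalization (and
    # random erasing) are deferred to the Fetcher / PrefetcherCuda on device
    dataset.transform = create_transform_v2(cfg=train_pp_cfg, is_training=True, normalize=False)

    loader = create_loader_v2(
        dataset,
        batch_size=128,
        is_training=True,
        normalize=True,  # fetcher applies mean/std on device
        pp_cfg=train_pp_cfg,
        mix_cfg=MixupCfg(mixup_alpha=0.2, num_classes=1000),  # num_classes required w/ label smoothing
        num_workers=4,
    )

    for sample, target in loader:
        pass  # training step here

Note the one exception wired up in setup_data() above: on XLA with random erasing enabled, normalization (and RE) are flipped into the CPU-side transform via normalize_in_transform, since RandomErasing shouldn't run on XLA device tensors.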