pytorch-image-models/timm/data/transforms_factory.py

""" Transforms Factory
Factory methods for building image transforms for use with TIMM (PyTorch Image Models)

Hacked together by / Copyright 2020 Ross Wightman
"""
import math
from typing import Union, Tuple

import torch
from torchvision import transforms

from timm.data.auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform
from timm.data.config import PreprocessCfg, AugCfg
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT
from timm.data.random_erasing import RandomErasing
from timm.data.transforms import RandomResizedCropAndInterpolation, ToNumpy, ToTensorNormalize


def transforms_noaug_train(
        img_size: Union[int, Tuple[int]] = 224,
        interpolation='bilinear',
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        normalize=False,
        compose=True,
):
    if interpolation == 'random':
        # random interpolation not supported with no-aug
        interpolation = 'bilinear'
    tfl = [
        transforms.Resize(img_size, transforms.InterpolationMode(interpolation)),
        transforms.CenterCrop(img_size)
    ]
    if normalize:
        tfl += [
            transforms.ToTensor(),
            transforms.Normalize(mean=torch.tensor(mean), std=torch.tensor(std))
        ]
    else:
        # (pre)fetcher and collate will handle tensor conversion and normalize
        tfl += [ToNumpy()]
    return transforms.Compose(tfl) if compose else tfl


def transforms_imagenet_train(
        img_size: Union[int, Tuple[int]] = 224,
        interpolation='random',
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        aug_cfg=AugCfg(),
        normalize=False,
        separate=False,
        compose=True,
):
    """
    If separate==True, the transforms are returned as a tuple of 3 separate transforms
    for use in a mixing dataset that passes
     * all data through the first (primary) transform, called the 'clean' data
     * a portion of the data through the secondary transform
     * normalizes and converts the branches above with the third, final transform
    """
    scale_range = tuple(aug_cfg.scale_range or (0.08, 1.0))  # default imagenet scale range
    ratio_range = tuple(aug_cfg.ratio_range or (3. / 4., 4. / 3.))  # default imagenet ratio range

    # 'primary' train transforms include random resize + crop w/ optional horizontal and vertical flipping aug.
    # This is the core of standard ImageNet ResNet and Inception pre-processing
    primary_tfl = [
        RandomResizedCropAndInterpolation(img_size, scale=scale_range, ratio=ratio_range, interpolation=interpolation)]
    if aug_cfg.hflip_prob > 0.:
        primary_tfl += [transforms.RandomHorizontalFlip(p=aug_cfg.hflip_prob)]
    if aug_cfg.vflip_prob > 0.:
        primary_tfl += [transforms.RandomVerticalFlip(p=aug_cfg.vflip_prob)]

    # 'secondary' transform stage includes either color jitter (could add lighting too) or auto-augmentations
    # such as AutoAugment, RandAugment, AugMix, etc
    secondary_tfl = []
    if aug_cfg.auto_augment:
        aa = aug_cfg.auto_augment
        assert isinstance(aa, str)
        if isinstance(img_size, (tuple, list)):
            img_size_min = min(img_size)
        else:
            img_size_min = img_size
        aa_params = dict(
            translate_const=int(img_size_min * 0.45),
            img_mean=tuple([min(255, round(255 * x)) for x in mean]),
        )
        if interpolation and interpolation != 'random':
            aa_params['interpolation'] = interpolation
        if aa.startswith('rand'):
            secondary_tfl += [rand_augment_transform(aa, aa_params)]
        elif aa.startswith('augmix'):
            aa_params['translate_pct'] = 0.3
            secondary_tfl += [augment_and_mix_transform(aa, aa_params)]
        else:
            secondary_tfl += [auto_augment_transform(aa, aa_params)]
    elif aug_cfg.color_jitter is not None:
        # color jitter is enabled when not using AA
        cj = aug_cfg.color_jitter
        if isinstance(cj, (list, tuple)):
            # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation
            # or 4 if also augmenting hue
            assert len(cj) in (3, 4)
        else:
            # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue
            cj = (float(cj),) * 3
        secondary_tfl += [transforms.ColorJitter(*cj)]

    # 'final' transform stage includes normalization, followed by optional random erasing and tensor conversion
    final_tfl = []
    if normalize:
        final_tfl += [
            ToTensorNormalize(mean=mean, std=std)
        ]
        if aug_cfg.re_prob > 0.:
            final_tfl.append(RandomErasing(
                aug_cfg.re_prob,
                mode=aug_cfg.re_mode,
                count=aug_cfg.re_count,
                num_splits=aug_cfg.num_aug_splits))
    else:
        # when normalize disabled, (pre)fetcher and collate will handle tensor conversion and normalize
        final_tfl += [ToNumpy()]

    if separate:
        # return each transform stage separately
        if compose:
            return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl)
        else:
            return primary_tfl, secondary_tfl, final_tfl
    else:
        tfl = primary_tfl + secondary_tfl + final_tfl
        return transforms.Compose(tfl) if compose else tfl


def transforms_imagenet_eval(
        img_size: Union[int, Tuple[int]] = 224,
        crop_pct=None,
        interpolation='bilinear',
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        normalize=False,
        compose=True,
):
    crop_pct = crop_pct or DEFAULT_CROP_PCT

    if isinstance(img_size, (tuple, list)):
        assert len(img_size) == 2
        if img_size[-1] == img_size[-2]:
            # FIXME handle case where img is square and we want non aspect preserving resize
            # fall-back to older behaviour so Resize scales to shortest edge if target is square
            scale_size = int(math.floor(img_size[0] / crop_pct))
        else:
            scale_size = tuple([int(x / crop_pct) for x in img_size])
    else:
        scale_size = int(math.floor(img_size / crop_pct))

    tfl = [
        transforms.Resize(scale_size, transforms.InterpolationMode(interpolation)),
        transforms.CenterCrop(img_size),
    ]
    if normalize:
        tfl += [
            ToTensorNormalize(mean=mean, std=std)
        ]
    else:
        # (pre)fetcher and collate will handle tensor conversion and normalize
        tfl += [ToNumpy()]

    return transforms.Compose(tfl) if compose else tfl


def create_transform_v2(
        cfg=PreprocessCfg(),
        is_training=False,
        normalize=False,
        separate=False,
        compose=True,
        tf_preprocessing=False,
):
    """

    Args:
        cfg: Pre-processing configuration
        is_training (bool): Create transform for training pre-processing
        normalize (bool): Enable normalization in transforms (otherwise handled by fetcher/pre-fetcher)
        separate (bool): Return transforms separated into stages (for train)
        compose (bool): Wrap transforms in transform.Compose(), returns list otherwise
        tf_preprocessing (bool): Use Tensorflow pre-processing (for validation)
    Returns:

    """
    input_size = cfg.input_size
    if isinstance(input_size, (tuple, list)):
        img_size = input_size[-2:]
    else:
        img_size = input_size

    if tf_preprocessing:
        assert not normalize, "Expecting normalization to be handled in (pre)fetcher w/ TF preprocessing"
        assert not separate, "Separate transforms not supported for TF preprocessing"
        from timm.data.tf_preprocessing import TfPreprocessTransform
        transform = TfPreprocessTransform(
            is_training=is_training, size=img_size, interpolation=cfg.interpolation)
    else:
        if is_training and cfg.aug is None:
            assert not separate, "Cannot perform split augmentation with no_aug"
            transform = transforms_noaug_train(
                img_size,
                interpolation=cfg.interpolation,
                normalize=normalize,
                mean=cfg.mean,
                std=cfg.std,
                compose=compose,
            )
        elif is_training:
            transform = transforms_imagenet_train(
                img_size,
                interpolation=cfg.interpolation,
                mean=cfg.mean,
                std=cfg.std,
                aug_cfg=cfg.aug,
                normalize=normalize,
                separate=separate,
                compose=compose,
            )
        else:
            assert not separate, "Separate transforms not supported for validation preprocessing"
            transform = transforms_imagenet_eval(
                img_size,
                interpolation=cfg.interpolation,
                crop_pct=cfg.crop_pct,
                mean=cfg.mean,
                std=cfg.std,
                normalize=normalize,
                compose=compose,
            )

    return transform


def create_transform(
        input_size,
        is_training=False,
        use_prefetcher=False,
        no_aug=False,
        scale=None,
        ratio=None,
        hflip=0.5,
        vflip=0.,
        color_jitter=0.4,
        auto_augment=None,
        interpolation='bilinear',
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        re_prob=0.,
        re_mode='const',
        re_count=1,
        re_num_splits=0,
        crop_pct=None,
        tf_preprocessing=False,
        separate=False):

    if isinstance(input_size, (tuple, list)):
        img_size = input_size[-2:]
    else:
        img_size = input_size

    normalize_in_transform = not use_prefetcher
    if tf_preprocessing and use_prefetcher:
        assert not separate, "Separate transforms not supported for TF preprocessing"
        from timm.data.tf_preprocessing import TfPreprocessTransform
        transform = TfPreprocessTransform(
            is_training=is_training, size=img_size, interpolation=interpolation)
    else:
        if is_training and no_aug:
            assert not separate, "Cannot perform split augmentation with no_aug"
            transform = transforms_noaug_train(
                img_size,
                interpolation=interpolation,
                mean=mean,
                std=std,
                normalize=normalize_in_transform,
            )
        elif is_training:
            aug_cfg = AugCfg(
                scale_range=scale,
                ratio_range=ratio,
                hflip_prob=hflip,
                vflip_prob=vflip,
                color_jitter=color_jitter,
                auto_augment=auto_augment,
                re_prob=re_prob,
                re_mode=re_mode,
                re_count=re_count,
                num_aug_splits=re_num_splits,
            )
            transform = transforms_imagenet_train(
                img_size,
                interpolation=interpolation,
                mean=mean,
                std=std,
                aug_cfg=aug_cfg,
                normalize=normalize_in_transform,
                separate=separate
            )
        else:
            assert not separate, "Separate transforms not supported for validation pre-processing"
            transform = transforms_imagenet_eval(
                img_size,
                interpolation=interpolation,
                mean=mean,
                std=std,
                crop_pct=crop_pct,
                normalize=normalize_in_transform,
            )

    return transform