""" Dataset Factory
|
|
|
|
Hacked together by / Copyright 2021, Ross Wightman
|
|
"""
|
|
import os
|
|
|
|
from torchvision.datasets import CIFAR100, CIFAR10, MNIST, KMNIST, FashionMNIST, ImageFolder
|
|
try:
|
|
from torchvision.datasets import Places365
|
|
has_places365 = True
|
|
except ImportError:
|
|
has_places365 = False
|
|
try:
|
|
from torchvision.datasets import INaturalist
|
|
has_inaturalist = True
|
|
except ImportError:
|
|
has_inaturalist = False
|
|
try:
|
|
from torchvision.datasets import QMNIST
|
|
has_qmnist = True
|
|
except ImportError:
|
|
has_qmnist = False
|
|
try:
|
|
from torchvision.datasets import ImageNet
|
|
has_imagenet = True
|
|
except ImportError:
|
|
has_imagenet = False
|
|
|
|
from .dataset import IterableImageDataset, ImageDataset
|
|
|
|
_TORCH_BASIC_DS = dict(
|
|
cifar10=CIFAR10,
|
|
cifar100=CIFAR100,
|
|
mnist=MNIST,
|
|
kmnist=KMNIST,
|
|
fashion_mnist=FashionMNIST,
|
|
)
|
|
_TRAIN_SYNONYM = dict(train=None, training=None)
|
|
_EVAL_SYNONYM = dict(val=None, valid=None, validation=None, eval=None, evaluation=None)
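# NOTE: the keys of _TORCH_BASIC_DS are matched against the '<name>' part of a 'torch/<name>' spec
# in create_dataset() below; _TRAIN_SYNONYM / _EVAL_SYNONYM act as sets of accepted split aliases.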


def _search_split(root, split):
    # look for sub-folder with name of split in root and use that if it exists
    split_name = split.split('[')[0]
    try_root = os.path.join(root, split_name)
    if os.path.exists(try_root):
        return try_root

    def _try(syn):
        for s in syn:
            try_root = os.path.join(root, s)
            if os.path.exists(try_root):
                return try_root
        return root
    if split_name in _TRAIN_SYNONYM:
        root = _try(_TRAIN_SYNONYM)
    elif split_name in _EVAL_SYNONYM:
        root = _try(_EVAL_SYNONYM)
    return root
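
# Example (hypothetical layout): with root='/data/imagenet' containing a 'val/' sub-folder,
# _search_split('/data/imagenet', 'validation') returns '/data/imagenet/val'; if no matching
# sub-folder exists, the original root is returned unchanged.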


def create_dataset(
        name,
        root,
        split='validation',
        search_split=True,
        class_map=None,
        load_bytes=False,
        is_training=False,
        download=False,
        batch_size=None,
        seed=42,
        repeats=0,
        **kwargs
):
    """ Dataset factory method

    In parentheses after each arg are the dataset types supported for that arg, one of:
      * folder - default, timm folder (or tar) based ImageDataset
      * torch - torchvision based datasets
      * HFDS - Hugging Face Datasets
      * TFDS - TensorFlow Datasets, wrapped in an IterableDataset interface via IterableImageDataset
      * WDS - WebDataset
      * all - any of the above

    Args:
        name: dataset name, empty is okay for folder based datasets
        root: root folder of dataset (all)
        split: dataset split (all)
        search_split: search for split specific child folder from root so one can specify
            `imagenet/` instead of `/imagenet/val`, etc on cmd line / config. (folder, torch/folder)
        class_map: specify class -> index mapping via text file or dict (folder)
        load_bytes: load data, return images as undecoded bytes (folder)
        is_training: create dataset in train mode; this is different from the split.
            For iterable / TFDS datasets it enables shuffle, ignored for other datasets. (TFDS, WDS)
        download: download dataset if not present and supported (HFDS, TFDS, torch)
        batch_size: batch size hint for iterable datasets (TFDS, WDS)
        seed: seed for iterable datasets (TFDS, WDS)
        repeats: dataset repeats per iteration i.e. epoch (TFDS, WDS)
        **kwargs: other args to pass to dataset

    Returns:
        Dataset object
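
    Examples (illustrative sketches only; dataset names and paths here are assumptions, not defaults):

        # folder-based ImageDataset (default reader), auto-resolving a 'train/' sub-folder under root
        train_ds = create_dataset('', root='/data/imagenet', split='train')

        # torchvision CIFAR-10, downloaded on first use
        eval_ds = create_dataset('torch/cifar10', root='/data/cifar10', split='validation', download=True)

        # TFDS-backed IterableImageDataset; batch_size is a hint passed through to the reader
        tfds_ds = create_dataset(
            'tfds/oxford_iiit_pet', root='/data/tfds', split='train', is_training=True, batch_size=32)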
    """
    name = name.lower()
    if name.startswith('torch/'):
        name = name.split('/', 2)[-1]
        torch_kwargs = dict(root=root, download=download, **kwargs)
        if name in _TORCH_BASIC_DS:
            ds_class = _TORCH_BASIC_DS[name]
            use_train = split in _TRAIN_SYNONYM
            ds = ds_class(train=use_train, **torch_kwargs)
        elif name == 'inaturalist' or name == 'inat':
            assert has_inaturalist, 'Please update to PyTorch 1.10, torchvision 0.11+ for Inaturalist'
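            # NOTE illustrative split convention, derived from the parsing below: split may be given as
            # '<target_type>/<split>', e.g. 'kingdom_phylum/train' yields target_type=['kingdom', 'phylum']
            # and a split of '2021_train'.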
            target_type = 'full'
            split_split = split.split('/')
            if len(split_split) > 1:
                target_type = split_split[0].split('_')
                if len(target_type) == 1:
                    target_type = target_type[0]
                split = split_split[-1]
            if split in _TRAIN_SYNONYM:
                split = '2021_train'
            elif split in _EVAL_SYNONYM:
                split = '2021_valid'
            ds = INaturalist(version=split, target_type=target_type, **torch_kwargs)
        elif name == 'places365':
            assert has_places365, 'Please update to a newer PyTorch and torchvision for Places365 dataset.'
            if split in _TRAIN_SYNONYM:
                split = 'train-standard'
            elif split in _EVAL_SYNONYM:
                split = 'val'
            ds = Places365(split=split, **torch_kwargs)
        elif name == 'qmnist':
            assert has_qmnist, 'Please update to a newer PyTorch and torchvision for QMNIST dataset.'
            use_train = split in _TRAIN_SYNONYM
            ds = QMNIST(train=use_train, **torch_kwargs)
        elif name == 'imagenet':
            assert has_imagenet, 'Please update to a newer PyTorch and torchvision for ImageNet dataset.'
            if split in _EVAL_SYNONYM:
                split = 'val'
            ds = ImageNet(split=split, **torch_kwargs)
        elif name == 'image_folder' or name == 'folder':
            # in case torchvision ImageFolder is preferred over timm ImageDataset for some reason
            if search_split and os.path.isdir(root):
                # look for split specific sub-folder in root
                root = _search_split(root, split)
            ds = ImageFolder(root, **kwargs)
        else:
            assert False, f"Unknown torchvision dataset {name}"
    elif name.startswith('hfds/'):
        # NOTE right now, the HF datasets default arrow format is a random-access Dataset,
        # there will be an IterableDataset variant too, TBD
        ds = ImageDataset(root, reader=name, split=split, **kwargs)
    elif name.startswith('tfds/'):
        ds = IterableImageDataset(
            root,
            reader=name,
            split=split,
            is_training=is_training,
            download=download,
            batch_size=batch_size,
            repeats=repeats,
            seed=seed,
            **kwargs
        )
    elif name.startswith('wds/'):
        ds = IterableImageDataset(
            root,
            reader=name,
            split=split,
            is_training=is_training,
            batch_size=batch_size,
            repeats=repeats,
            seed=seed,
            **kwargs
        )
    else:
        # FIXME support more advanced split cfg for ImageFolder/Tar datasets in the future
        if search_split and os.path.isdir(root):
            # look for split specific sub-folder in root
            root = _search_split(root, split)
        ds = ImageDataset(root, reader=name, class_map=class_map, load_bytes=load_bytes, **kwargs)
    return ds