pytorch-image-models/timm/data/readers/reader_hfds.py

""" Dataset reader that wraps Hugging Face datasets

Hacked together by / Copyright 2022 Ross Wightman
"""
import io
import math
import torch
import torch.distributed as dist
from PIL import Image

try:
    import datasets
except ImportError as e:
    print("Please install Hugging Face datasets package `pip install datasets`.")
    exit(1)
from .class_map import load_class_map
from .reader import Reader


def get_class_labels(info, label_key='label'):
    """Build a class-name -> class-index mapping from dataset feature info.

    Args:
        info: Object exposing a ``features`` mapping (the reader passes
            ``dataset.info``).
        label_key: Name of the feature that holds the class label.

    Returns:
        A dict mapping every entry of the label feature's ``names`` to
        ``str2int(name)``. An empty dict is returned when ``label_key`` is not
        one of the features.
    """
    # Test the requested key, not a hard-coded 'label': otherwise datasets whose
    # label column has a different name always produced an empty mapping, and a
    # dataset with 'label' but without `label_key` raised KeyError below.
    if label_key not in info.features:
        return {}
    class_label = info.features[label_key]
    return {n: class_label.str2int(n) for n in class_label.names}


class ReaderHfds(Reader):
    """Reader that serves (image file object, label) pairs from a Hugging Face dataset."""

    def __init__(
            self,
            root,
            name,
            split='train',
            class_map=None,
            label_key='label',
            download=False,
    ):
        """Load the dataset split and set up label handling.

        Args:
            root: Directory passed to ``datasets.load_dataset`` as ``cache_dir``.
            name: Dataset identifier, passed as the first (path) argument of
                ``datasets.load_dataset``.
            split: Split to load; also used to index ``dataset.info.splits``.
            class_map: Optional class map handed to ``load_class_map``. When
                given, labels returned by ``__getitem__`` are remapped through
                the resulting ``class_to_idx``.
            label_key: Name of the column holding the label.
            download: Unused by this reader.
        """
        super().__init__()
        self.root = root
        self.split = split
        self.dataset = datasets.load_dataset(
            name,  # 'name' maps to path arg in hf datasets
            split=split,
            cache_dir=self.root,  # timm doesn't expect hidden cache dir for datasets, specify a path
        )
        # leave decode for caller, plus we want easy access to original path names...
        self.dataset = self.dataset.cast_column('image', datasets.Image(decode=False))

        self.label_key = label_key
        self.remap_class = False
        if class_map:
            # An explicit class map overrides whatever the dataset info declares.
            self.class_to_idx = load_class_map(class_map)
            self.remap_class = True
        else:
            self.class_to_idx = get_class_labels(self.dataset.info, self.label_key)
        self.split_info = self.dataset.info.splits[split]
        self.num_samples = self.split_info.num_examples

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at ``index``.

        ``image`` is a binary file-like object: an in-memory ``io.BytesIO`` when
        the sample carries raw bytes, otherwise the file at the sample's path
        opened in ``'rb'`` mode. The caller is responsible for closing it.

        Raises:
            ValueError: If the sample has neither image bytes nor an image path.
        """
        item = self.dataset[index]
        image = item['image']
        if image.get('bytes'):
            image = io.BytesIO(image['bytes'])
        elif image.get('path'):
            image = open(image['path'], 'rb')
        else:
            # Explicit raise instead of `assert`, which is stripped under `python -O`.
            raise ValueError(
                f"Sample {index} has neither image bytes nor an image path.")
        label = item[self.label_key]
        if self.remap_class:
            label = self.class_to_idx[label]
        return image, label

    def __len__(self):
        """Return the number of samples in the loaded dataset."""
        return len(self.dataset)

    def _filename(self, index, basename=False, absolute=False):
        """Return the stored image path of the sample at ``index``.

        Args:
            index: Sample index.
            basename: When true, return only the final path component.
            absolute: Has no effect here; the path is returned as stored in
                the dataset.

        Returns:
            The sample's ``image['path']`` value (which may be empty/None when
            the dataset only stores bytes), reduced to its basename on request.
        """
        import os  # local import: `os` is not among this module's top-level imports

        path = self.dataset[index]['image']['path']
        if basename and path:
            path = os.path.basename(path)
        return path
Rename dataset/parsers -> dataset/readers, create_parser to create_reader, etc 2 years ago			`""" Dataset reader that wraps Hugging Face datasets`

Add initial Hugging Face Datasets parser impl. 2 years ago			`Hacked together by / Copyright 2022 Ross Wightman`
			`"""`
			`import io`
			`import math`
			`import torch`
			`import torch.distributed as dist`
			`from PIL import Image`

			`try:`
			`import datasets`
			`except ImportError as e:`
			print("Please install Hugging Face datasets package `pip install datasets`.")
			`exit(1)`
Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`from .class_map import load_class_map`
Rename dataset/parsers -> dataset/readers, create_parser to create_reader, etc 2 years ago			`from .reader import Reader`
Add initial Hugging Face Datasets parser impl. 2 years ago

Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`def get_class_labels(info, label_key='label'):`
Add initial Hugging Face Datasets parser impl. 2 years ago			`if 'label' not in info.features:`
			`return {}`
Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`class_label = info.features[label_key]`
Add initial Hugging Face Datasets parser impl. 2 years ago			`class_to_idx = {n: class_label.str2int(n) for n in class_label.names}`
			`return class_to_idx`


Rename dataset/parsers -> dataset/readers, create_parser to create_reader, etc 2 years ago			`class ReaderHfds(Reader):`
Add initial Hugging Face Datasets parser impl. 2 years ago
			`def __init__(`
			`self,`
			`root,`
			`name,`
			`split='train',`
			`class_map=None,`
Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`label_key='label',`
Add initial Hugging Face Datasets parser impl. 2 years ago			`download=False,`
			`):`
			`"""`
			`"""`
			`super().__init__()`
			`self.root = root`
			`self.split = split`
			`self.dataset = datasets.load_dataset(`
			`name, # 'name' maps to path arg in hf datasets`
			`split=split,`
			`cache_dir=self.root, # timm doesn't expect hidden cache dir for datasets, specify a path`
			`)`
			`# leave decode for caller, plus we want easy access to original path names...`
			`self.dataset = self.dataset.cast_column('image', datasets.Image(decode=False))`

Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`self.label_key = label_key`
			`self.remap_class = False`
			`if class_map:`
			`self.class_to_idx = load_class_map(class_map)`
			`self.remap_class = True`
			`else:`
			`self.class_to_idx = get_class_labels(self.dataset.info, self.label_key)`
Add initial Hugging Face Datasets parser impl. 2 years ago			`self.split_info = self.dataset.info.splits[split]`
Set num_workers in Iterable WDS/TFDS datasets early so sample estimate is correct 2 years ago			`self.num_samples = self.split_info.num_examples`
Add initial Hugging Face Datasets parser impl. 2 years ago
			`def __getitem__(self, index):`
			`item = self.dataset[index]`
			`image = item['image']`
			`if 'bytes' in image and image['bytes']:`
			`image = io.BytesIO(image['bytes'])`
			`else:`
			`assert 'path' in image and image['path']`
			`image = open(image['path'], 'rb')`
Support HF datasets and TFSD w/ a sub-path by fixing split, fix #1598 ... add class mapping support to HF datasets in case class label isn't in info. 2 years ago			`label = item[self.label_key]`
			`if self.remap_class:`
			`label = self.class_to_idx[label]`
			`return image, label`
Add initial Hugging Face Datasets parser impl. 2 years ago
			`def __len__(self):`
			`return len(self.dataset)`

			`def _filename(self, index, basename=False, absolute=False):`
			`item = self.dataset[index]`
			`return item['image']['path']`