import os
from contextlib import suppress
from dataclasses import dataclass, field, InitVar
from typing import Optional

import torch
from torch.distributed import ReduceOp

try:
    import torch_xla.core.xla_model as xm
    _HAS_XLA = True
except ImportError:
    xm = None
    _HAS_XLA = False

try:
    # only the very latest XLA builds have AMP
    import torch_xla.amp as xa
except ImportError:
    xa = None

from .device_env import DeviceEnv, DeviceEnvType, TensorList


# Map torch.distributed ReduceOp values to the reduce op names accepted by xm.all_reduce().
_PT_TO_XM_OP = {
    ReduceOp.SUM: 'sum',
    ReduceOp.PRODUCT: 'mul',
    ReduceOp.MIN: 'min',
    ReduceOp.MAX: 'max',
    ReduceOp.BAND: 'and',
    ReduceOp.BOR: 'or',
}


def is_xla_available(xla_device_type=None):
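    """Return True if torch_xla imported successfully and at least one XLA device
    of the (optionally) requested kind, e.g. 'TPU', is visible to this process."""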
    if not _HAS_XLA:
        return False
    supported_devs = xm.get_xla_supported_devices(devkind=xla_device_type)
    return len(supported_devs) >= 1


@dataclass
class DeviceEnvXla(DeviceEnv):
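    """Device environment for XLA devices (TPU, or XLA-backed GPU/CPU) via torch_xla."""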

    def __post_init__(self, device_type: Optional[str], device_idx: Optional[int]):
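        """Resolve the XLA device, world size, rank ordinals, and autocast context."""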
        if device_type is not None:
            device_type = device_type.upper()
            assert device_type in ('TPU', 'GPU', 'CPU'), "XLA device type must be one of ('TPU', 'GPU', 'CPU')"
        self.device = xm.xla_device(n=device_idx, devkind=device_type)
        self.world_size = xm.xrt_world_size()
        if self.distributed:
            assert device_idx is None, "device_index is based on local rank for distributed XLA mode"
            self.local_rank = xm.get_local_ordinal()
            self.global_rank = xm.get_ordinal()
        else:
            self.local_rank = 0
            self.global_rank = 0
        if self.amp:
            assert xa is not None, 'XLA AMP is not present on this build'
        if self.autocast is None:
            self.autocast = xa.autocast if self.amp else suppress

    @property
    def type(self) -> DeviceEnvType:
        return DeviceEnvType.XLA

    def wrap_distributed(self, *modules):
        # No DDP-style wrapper is needed for XLA, modules are returned as-is.
        wrapped = [m for m in modules]  # NO-OP
        return wrapped[0] if len(wrapped) == 1 else wrapped

    def wrap_parallel(self, *modules):
        assert False, "Not implemented"

    def mark_step(self):
        xm.mark_step()  # cut the lazy tensor graph and dispatch pending device computation

    def all_reduce(self, tensor: torch.Tensor, op=ReduceOp.SUM, average=False):
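        """Functional all-reduce; returns the reduced tensor without modifying the input."""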
        assert isinstance(tensor, torch.Tensor)  # unlike in-place variant, lists/tuples not allowed
        op = _PT_TO_XM_OP[op]
        scale = 1.0 / self.world_size if average else 1.0
        return xm.all_reduce(op, tensor, scale=scale)

    def all_reduce_(self, tensor: TensorList, op=ReduceOp.SUM, average=False):
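        """In-place all-reduce of a tensor or list of tensors; returns the reduced input."""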
        op = _PT_TO_XM_OP[op]
        scale = 1.0 / self.world_size if average else 1.0
        wrapped = False
        if isinstance(tensor, torch.Tensor):
            tensor = [tensor]  # bare tensors are not operated on in-place
            wrapped = True
        xm.all_reduce(op, tensor, scale=scale)
        if wrapped:
            tensor = tensor[0]
        return tensor

    def all_gather(self, tensor: torch.Tensor, cat_dim=0):
        output = xm.all_gather(tensor, cat_dim)
        return output

    def all_to_all(self, tensor, num_splits, split_dim, cat_dim=0):
        output = xm.all_to_all(tensor, split_dim, cat_dim, num_splits)
        return output

    def broadcast(self, tensor: torch.Tensor, src_rank=0):
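        """Broadcast `tensor` from `src_rank` to all ranks, emulated as a sum all-reduce
        where non-source ranks contribute zeros. Returns the broadcast tensor."""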
        if self.global_rank != src_rank:
            # xm.all_reduce on a bare tensor returns the result rather than reducing
            # in-place, so capture it; the summed result equals the source tensor.
            reduce_tensor = torch.zeros_like(tensor)
            tensor = xm.all_reduce('sum', reduce_tensor)
        else:
            tensor = xm.all_reduce('sum', tensor)
        return tensor

    def broadcast_(self, tensor: torch.Tensor, src_rank=0):
        out_tensor = self.broadcast(tensor, src_rank)
        return tensor.copy_(out_tensor)

    def barrier(self):
        xm.rendezvous('timm.bits.dist_barrier')
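

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# It assumes DeviceEnvXla can be constructed with default field values; the
# actual InitVar/field defaults live on the DeviceEnv base class and may differ.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    if is_xla_available():
        env = DeviceEnvXla()  # hypothetical default construction
        t = torch.ones(4, device=env.device)
        reduced = env.all_reduce(t, average=True)  # mean of `t` across all ranks
        env.mark_step()  # flush pending lazy XLA ops
        print(reduced)
    else:
        print('No XLA device available')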