Add worker_seeding arg to allow selecting old vs updated data loader worker seeding for (old) experiment repeatability

pull/880/head
Ross Wightman 3 years ago
parent 6478bcd02c
commit 80075b0b8a

timm/data/loader.py

@@ -3,8 +3,11 @@
 Prefetcher and Fast Collate inspired by NVIDIA APEX example at
 https://github.com/NVIDIA/apex/commit/d5e2bb4bdeedd27b1dfaf5bb2b24d6c000dee9be#diff-cf86c282ff7fba81fad27a559379d5bf
 
-Hacked together by / Copyright 2020 Ross Wightman
+Hacked together by / Copyright 2021 Ross Wightman
 """
+import random
+from functools import partial
+from typing import Callable
 
 import torch.utils.data
 import numpy as np
@@ -125,10 +128,20 @@ class PrefetchLoader:
             self.loader.collate_fn.mixup_enabled = x
 
 
-def _worker_init(worker_id):
+def _worker_init(worker_id, worker_seeding='all'):
     worker_info = torch.utils.data.get_worker_info()
     assert worker_info.id == worker_id
-    np.random.seed(worker_info.seed % (2**32-1))
+    if isinstance(worker_seeding, Callable):
+        seed = worker_seeding(worker_info)
+        random.seed(seed)
+        torch.manual_seed(seed)
+        np.random.seed(seed % (2 ** 32 - 1))
+    else:
+        assert worker_seeding in ('all', 'part')
+        # random / torch seed already called in dataloader iter class w/ worker_info.seed
+        # to reproduce some old results (same seed + hparam combo), partial seeding is required (skip numpy re-seed)
+        if worker_seeding == 'all':
+            np.random.seed(worker_info.seed % (2 ** 32 - 1))
 
 
 def create_loader(
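A note on the two string modes: PyTorch's DataLoader already seeds Python's random module and torch inside each worker from worker_info.seed, but numpy keeps its own independent state, so 'all' re-seeds numpy as well, while 'part' skips the numpy re-seed to reproduce results from before this change. Passing a callable instead hands over seeding entirely. A minimal sketch of the callable path, assuming timm at this commit is installed and that _worker_init is importable from timm.data.loader (the seed_from_worker_id policy is hypothetical, not part of the commit):

from functools import partial

import torch
from torch.utils.data import DataLoader, TensorDataset
from timm.data.loader import _worker_init  # module-level helper shown in the hunk above


def seed_from_worker_id(worker_info):
    # Hypothetical policy: a fixed per-worker seed, ignoring the per-epoch
    # base seed that PyTorch folds into worker_info.seed.
    return 1000 + worker_info.id


if __name__ == '__main__':
    dataset = TensorDataset(torch.arange(64).float())
    loader = DataLoader(
        dataset,
        batch_size=8,
        num_workers=2,
        # same wiring as create_loader below: bind the extra kwarg with partial
        worker_init_fn=partial(_worker_init, worker_seeding=seed_from_worker_id),
    )
    for (batch,) in loader:  # each worker seeds random/torch/numpy via the callable
        pass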
@@ -162,6 +175,7 @@ def create_loader(
         tf_preprocessing=False,
         use_multi_epochs_loader=False,
         persistent_workers=True,
+        worker_seeding='all',
 ):
     re_num_splits = 0
     if re_split:
@@ -219,8 +233,9 @@ def create_loader(
         collate_fn=collate_fn,
         pin_memory=pin_memory,
         drop_last=is_training,
-        worker_init_fn=_worker_init,
-        persistent_workers=persistent_workers)
+        worker_init_fn=partial(_worker_init, worker_seeding=worker_seeding),
+        persistent_workers=persistent_workers
+    )
     try:
        loader = loader_class(dataset, **loader_args)
    except TypeError as e:
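The switch from passing _worker_init directly to wrapping it in functools.partial is what lets the new keyword through: DataLoader invokes worker_init_fn with the worker id as its only argument, so worker_seeding has to be pre-bound. A standalone sketch of the pattern (the stub below just echoes its arguments in place of the real seeding logic):

from functools import partial


def _worker_init(worker_id, worker_seeding='all'):
    # stub standing in for the seeding logic shown above
    print(f'worker {worker_id}: seeding mode {worker_seeding!r}')


# DataLoader calls this with worker_id alone; worker_seeding is already bound.
init_fn = partial(_worker_init, worker_seeding='part')
init_fn(0)  # prints: worker 0: seeding mode 'part'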

train.py

@@ -252,6 +252,8 @@ parser.add_argument('--model-ema-decay', type=float, default=0.9998,
 # Misc
 parser.add_argument('--seed', type=int, default=42, metavar='S',
                     help='random seed (default: 42)')
+parser.add_argument('--worker-seeding', type=str, default='all',
+                    help='worker seed mode (default: all)')
 parser.add_argument('--log-interval', type=int, default=50, metavar='N',
                     help='how many batches to wait before logging training status')
 parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
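On the train.py side, argparse maps --worker-seeding to args.worker_seeding, which main() passes straight through to create_loader in the next hunk, so selecting the old behavior becomes a command-line switch (e.g. appending --worker-seeding part to an existing training command). A quick self-contained check of that mapping:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=42, metavar='S',
                    help='random seed (default: 42)')
parser.add_argument('--worker-seeding', type=str, default='all',
                    help='worker seed mode (default: all)')

args = parser.parse_args(['--worker-seeding', 'part'])
assert args.worker_seeding == 'part'  # dashes become underscores on the namespace
assert args.seed == 42                # unrelated defaults are unaffected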
@@ -535,7 +537,8 @@ def main():
         distributed=args.distributed,
         collate_fn=collate_fn,
         pin_memory=args.pin_mem,
-        use_multi_epochs_loader=args.use_multi_epochs_loader
+        use_multi_epochs_loader=args.use_multi_epochs_loader,
+        worker_seeding=args.worker_seeding,
     )
 
     loader_eval = create_loader(
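Putting the pieces together, a caller that wants the pre-change worker seeding only needs the one extra argument. A minimal sketch, assuming timm at this commit is installed and using its create_dataset/create_loader factories; the dataset name, root, input size, and other settings below are placeholders, not values from the commit:

from timm.data import create_dataset, create_loader

if __name__ == '__main__':
    dataset = create_dataset('torch/cifar10', root='./data', split='train', download=True)
    loader = create_loader(
        dataset,
        input_size=(3, 32, 32),
        batch_size=32,
        is_training=True,
        use_prefetcher=False,  # skip the CUDA prefetcher for a CPU-only sketch
        num_workers=2,
        worker_seeding='part',  # skip the numpy re-seed to match pre-change runs
    )
    for images, targets in loader:
        break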