""" Checkpoint Saver

Track top-n training checkpoints and maintain recovery checkpoints on specified intervals.

Hacked together by / Copyright 2020 Ross Wightman
"""
import glob
import operator
import os
import logging

import torch

from .model import unwrap_model, get_state_dict


_logger = logging.getLogger(__name__)


class CheckpointSaver:
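    """Track the top-n best checkpoints by a validation metric and maintain rolling
    'recovery' checkpoints that can be used to resume an interrupted run.

    Example usage (a rough sketch, not part of this module; `model`, `optimizer`,
    `args`, `output_dir`, `eval_metric` and `metrics` are assumed to come from the
    surrounding training script):

        saver = CheckpointSaver(
            model, optimizer, args=args,
            checkpoint_dir=output_dir, decreasing=(eval_metric == 'loss'))
        for epoch in range(num_epochs):
            # ... train and validate, producing eval metrics ...
            best_metric, best_epoch = saver.save_checkpoint(epoch, metric=metrics[eval_metric])
    """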

    def __init__(
            self,
            model,
            optimizer,
            args=None,
            model_ema=None,
            amp_scaler=None,
            checkpoint_prefix='checkpoint',
            recovery_prefix='recovery',
            checkpoint_dir='',
            recovery_dir='',
            decreasing=False,
            max_history=10,
            unwrap_fn=unwrap_model):

        # objects to save state_dicts of
        self.model = model
        self.optimizer = optimizer
        self.args = args
        self.model_ema = model_ema
        self.amp_scaler = amp_scaler

        # state
        self.checkpoint_files = []  # (filename, metric) tuples in order of decreasing betterness
        self.best_epoch = None
        self.best_metric = None
        self.curr_recovery_file = ''
        self.last_recovery_file = ''

        # config
        self.checkpoint_dir = checkpoint_dir
        self.recovery_dir = recovery_dir
        self.save_prefix = checkpoint_prefix
        self.recovery_prefix = recovery_prefix
        self.extension = '.pth.tar'
        self.decreasing = decreasing  # a lower metric is better if True
        self.cmp = operator.lt if decreasing else operator.gt  # True if lhs better than rhs
        self.max_history = max_history
        self.unwrap_fn = unwrap_fn
        assert self.max_history >= 1

    def save_checkpoint(self, epoch, metric=None):
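        """Save a checkpoint for `epoch` and update the tracked top-`max_history` list.

        The state is written to a temporary file and renamed over the 'last' checkpoint;
        per-epoch ranked checkpoints and 'model_best' are hard links to it. Returns a
        (best_metric, best_epoch) tuple, or (None, None) if no best metric is recorded yet.
        """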
        assert epoch >= 0
        tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension)
        last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension)
        self._save(tmp_save_path, epoch, metric)
        if os.path.exists(last_save_path):
            os.unlink(last_save_path)  # required for Windows support.
        os.rename(tmp_save_path, last_save_path)
        worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None
        if (len(self.checkpoint_files) < self.max_history
                or metric is None or self.cmp(metric, worst_file[1])):
            if len(self.checkpoint_files) >= self.max_history:
                self._cleanup_checkpoints(1)
            filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension
            save_path = os.path.join(self.checkpoint_dir, filename)
            os.link(last_save_path, save_path)
            self.checkpoint_files.append((save_path, metric))
            self.checkpoint_files = sorted(
                self.checkpoint_files, key=lambda x: x[1],
                reverse=not self.decreasing)  # sort in descending order if a lower metric is not better

            checkpoints_str = "Current checkpoints:\n"
            for c in self.checkpoint_files:
                checkpoints_str += ' {}\n'.format(c)
            _logger.info(checkpoints_str)

            if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)):
                self.best_epoch = epoch
                self.best_metric = metric
                best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension)
                if os.path.exists(best_save_path):
                    os.unlink(best_save_path)
                os.link(last_save_path, best_save_path)

        return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch)

    def _save(self, save_path, epoch, metric=None):
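        """Assemble the checkpoint dict (epoch, arch, model/optimizer state, optional
        AMP scaler, EMA weights, args, metric) and write it with torch.save."""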
        save_state = {
            'epoch': epoch,
            'arch': type(self.model).__name__.lower(),
            'state_dict': get_state_dict(self.model, self.unwrap_fn),
            'optimizer': self.optimizer.state_dict(),
            'version': 2,  # version < 2 increments epoch before save
        }
        if self.args is not None:
            save_state['arch'] = self.args.model
            save_state['args'] = self.args
        if self.amp_scaler is not None:
            amp_key = getattr(self.amp_scaler, 'state_dict_key', 'amp_scaler')
            save_state[amp_key] = self.amp_scaler.state_dict()
        if self.model_ema is not None:
            save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn)
        if metric is not None:
            save_state['metric'] = metric
        torch.save(save_state, save_path)

    def _cleanup_checkpoints(self, trim=0):
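        """Remove tracked checkpoint files beyond index `max_history - trim`, i.e. the
        worst-ranked entries, and trim `self.checkpoint_files` to match."""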
        trim = min(len(self.checkpoint_files), trim)
        delete_index = self.max_history - trim
        if delete_index < 0 or len(self.checkpoint_files) <= delete_index:
            return
        to_delete = self.checkpoint_files[delete_index:]
        for d in to_delete:
            try:
                _logger.debug("Cleaning checkpoint: {}".format(d))
                os.remove(d[0])
            except Exception as e:
                _logger.error("Exception '{}' while deleting checkpoint".format(e))
        self.checkpoint_files = self.checkpoint_files[:delete_index]

    def save_recovery(self, epoch, batch_idx=0):
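        """Write a recovery checkpoint named for (`epoch`, `batch_idx`) and delete the
        recovery file from two calls ago, so at most the two most recent remain."""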
        assert epoch >= 0
        filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension
        save_path = os.path.join(self.recovery_dir, filename)
        self._save(save_path, epoch)
        if os.path.exists(self.last_recovery_file):
            try:
                _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file))
                os.remove(self.last_recovery_file)
            except Exception as e:
                _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file))
        self.last_recovery_file = self.curr_recovery_file
        self.curr_recovery_file = save_path

    def find_recovery(self):
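        """Return the first recovery checkpoint (sorted by filename) matching the
        configured prefix and extension, or an empty string if none exist."""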
        recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix)
        files = glob.glob(recovery_path + '*' + self.extension)
        files = sorted(files)
        return files[0] if len(files) else ''