|
|
@ -236,6 +236,8 @@ parser.add_argument('--log-interval', type=int, default=50, metavar='N',
|
|
|
|
help='how many batches to wait before logging training status')
|
|
|
|
help='how many batches to wait before logging training status')
|
|
|
|
parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
|
|
|
|
parser.add_argument('--recovery-interval', type=int, default=0, metavar='N',
|
|
|
|
help='how many batches to wait before writing recovery checkpoint')
|
|
|
|
help='how many batches to wait before writing recovery checkpoint')
|
|
|
|
|
|
|
|
parser.add_argument('--checkpoint-hist', type=int, default=10, metavar='N',
|
|
|
|
|
|
|
|
help='number of checkpoints to keep (default: 10)')
|
|
|
|
parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
|
|
|
|
parser.add_argument('-j', '--workers', type=int, default=4, metavar='N',
|
|
|
|
help='how many training processes to use (default: 1)')
|
|
|
|
help='how many training processes to use (default: 1)')
|
|
|
|
parser.add_argument('--save-images', action='store_true', default=False,
|
|
|
|
parser.add_argument('--save-images', action='store_true', default=False,
|
|
|
@ -547,7 +549,7 @@ def main():
|
|
|
|
decreasing = True if eval_metric == 'loss' else False
|
|
|
|
decreasing = True if eval_metric == 'loss' else False
|
|
|
|
saver = CheckpointSaver(
|
|
|
|
saver = CheckpointSaver(
|
|
|
|
model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
|
|
|
|
model=model, optimizer=optimizer, args=args, model_ema=model_ema, amp_scaler=loss_scaler,
|
|
|
|
checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing)
|
|
|
|
checkpoint_dir=output_dir, recovery_dir=output_dir, decreasing=decreasing, max_history=args.checkpoint_hist)
|
|
|
|
with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
|
|
|
|
with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
|
|
|
|
f.write(args_text)
|
|
|
|
f.write(args_text)
|
|
|
|
|
|
|
|
|
|
|
|