diff --git a/timm/optim/__init__.py b/timm/optim/__init__.py index 994b36d2..ef4a0aec 100644 --- a/timm/optim/__init__.py +++ b/timm/optim/__init__.py @@ -5,4 +5,6 @@ from .radam import RAdam from .novograd import NovoGrad from .nvnovograd import NvNovoGrad from .lookahead import Lookahead +from .adamp import AdamP +from .sgdp import SGDP from .optim_factory import create_optimizer diff --git a/timm/optim/adamp.py b/timm/optim/adamp.py new file mode 100644 index 00000000..468c3e86 --- /dev/null +++ b/timm/optim/adamp.py @@ -0,0 +1,107 @@ +""" +AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py + +Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 +Code: https://github.com/clovaai/AdamP + +Copyright (c) 2020-present NAVER Corp. +MIT license +""" + +import torch +import torch.nn as nn +from torch.optim.optimizer import Optimizer, required +import math + +class AdamP(Optimizer): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) + super(AdamP, self).__init__(params, defaults) + + def _channel_view(self, x): + return x.view(x.size(0), -1) + + def _layer_view(self, x): + return x.view(1, -1) + + def _cosine_similarity(self, x, y, eps, view_func): + x = view_func(x) + y = view_func(y) + + x_norm = x.norm(dim=1).add_(eps) + y_norm = y.norm(dim=1).add_(eps) + dot = (x * y).sum(dim=1) + + return dot.abs() / x_norm / y_norm + + def _projection(self, p, grad, perturb, delta, wd_ratio, eps): + wd = 1 + expand_size = [-1] + [1] * (len(p.shape) - 1) + for view_func in [self._channel_view, self._layer_view]: + + cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) + + if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): + p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) + wd = wd_ratio + + return perturb, wd + + return perturb, wd + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad.data + beta1, beta2 = group['betas'] + nesterov = group['nesterov'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data) + + # Adam + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + step_size = group['lr'] / bias_correction1 + + if nesterov: + perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom + else: + perturb = exp_avg / denom + + # Projection + wd_ratio = 1 + if len(p.shape) > 1: + perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps']) + + # Weight decay + if group['weight_decay'] > 0: + p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) + + # Step + p.data.add_(-step_size, perturb) + + return loss diff --git a/timm/optim/optim_factory.py b/timm/optim/optim_factory.py index d97887d5..c6565daa 100644 --- a/timm/optim/optim_factory.py +++ b/timm/optim/optim_factory.py @@ -1,6 +1,6 @@ import torch from torch import optim as optim -from timm.optim import Nadam, RMSpropTF, AdamW, RAdam, NovoGrad, NvNovoGrad, Lookahead +from timm.optim import Nadam, RMSpropTF, AdamW, RAdam, NovoGrad, NvNovoGrad, Lookahead, AdamP, SGDP try: from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD has_apex = True @@ -60,6 +60,14 @@ def create_optimizer(args, model, filter_bias_and_bn=True): elif opt_lower == 'radam': optimizer = RAdam( parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) + elif opt_lower == 'adamp': + optimizer = AdamP( + parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps, + delta=0.1, wd_ratio=0.01, nesterov=True) + elif opt_lower == 'sgdp': + optimizer = SGDP( + parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, + eps=args.opt_eps, nesterov=True) elif opt_lower == 'adadelta': optimizer = optim.Adadelta( parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) diff --git a/timm/optim/sgdp.py b/timm/optim/sgdp.py new file mode 100644 index 00000000..f4a94aa3 --- /dev/null +++ b/timm/optim/sgdp.py @@ -0,0 +1,96 @@ +""" +SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py + +Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 +Code: https://github.com/clovaai/AdamP + +Copyright (c) 2020-present NAVER Corp. +MIT license +""" + +import torch +import torch.nn as nn +from torch.optim.optimizer import Optimizer, required +import math + +class SGDP(Optimizer): + def __init__(self, params, lr=required, momentum=0, dampening=0, + weight_decay=0, nesterov=False, eps=1e-8, delta=0.1, wd_ratio=0.1): + defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, + nesterov=nesterov, eps=eps, delta=delta, wd_ratio=wd_ratio) + super(SGDP, self).__init__(params, defaults) + + def _channel_view(self, x): + return x.view(x.size(0), -1) + + def _layer_view(self, x): + return x.view(1, -1) + + def _cosine_similarity(self, x, y, eps, view_func): + x = view_func(x) + y = view_func(y) + + x_norm = x.norm(dim=1).add_(eps) + y_norm = y.norm(dim=1).add_(eps) + dot = (x * y).sum(dim=1) + + return dot.abs() / x_norm / y_norm + + def _projection(self, p, grad, perturb, delta, wd_ratio, eps): + wd = 1 + expand_size = [-1] + [1] * (len(p.shape) - 1) + for view_func in [self._channel_view, self._layer_view]: + + cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) + + if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): + p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) + wd = wd_ratio + + return perturb, wd + + return perturb, wd + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + state = self.state[p] + + # State initialization + if len(state) == 0: + state['momentum'] = torch.zeros_like(p.data) + + # SGD + buf = state['momentum'] + buf.mul_(momentum).add_(1 - dampening, grad) + if nesterov: + d_p = grad + momentum * buf + else: + d_p = buf + + # Projection + wd_ratio = 1 + if len(p.shape) > 1: + d_p, wd_ratio = self._projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps']) + + # Weight decay + if weight_decay != 0: + p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum)) + + # Step + p.data.add_(-group['lr'], d_p) + + return loss