Add `adamp` and 'sgdp' optimizers.

Update requirements.txt

Update optim_factory.py

Add `adamp` optimizer

Update __init__.py

copy files of adamp & sgdp

Create adamp.py

Update __init__.py

Create sgdp.py

Update optim_factory.py

Update optim_factory.py

Update requirements.txt

Update adamp.py

Update sgdp.py

Update sgdp.py

Update adamp.py
pull/208/head
Sangdoo Yun 5 years ago committed by Ross Wightman
parent 0915beddad
commit e93e571f7a

@ -5,4 +5,6 @@ from .radam import RAdam
from .novograd import NovoGrad
from .nvnovograd import NvNovoGrad
from .lookahead import Lookahead
from .adamp import AdamP
from .sgdp import SGDP
from .optim_factory import create_optimizer

@ -0,0 +1,107 @@
"""
AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py
Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217
Code: https://github.com/clovaai/AdamP
Copyright (c) 2020-present NAVER Corp.
MIT license
"""
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer, required
import math
class AdamP(Optimizer):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
delta=delta, wd_ratio=wd_ratio, nesterov=nesterov)
super(AdamP, self).__init__(params, defaults)
def _channel_view(self, x):
return x.view(x.size(0), -1)
def _layer_view(self, x):
return x.view(1, -1)
def _cosine_similarity(self, x, y, eps, view_func):
x = view_func(x)
y = view_func(y)
x_norm = x.norm(dim=1).add_(eps)
y_norm = y.norm(dim=1).add_(eps)
dot = (x * y).sum(dim=1)
return dot.abs() / x_norm / y_norm
def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
wd = 1
expand_size = [-1] + [1] * (len(p.shape) - 1)
for view_func in [self._channel_view, self._layer_view]:
cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)
if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
wd = wd_ratio
return perturb, wd
return perturb, wd
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
beta1, beta2 = group['betas']
nesterov = group['nesterov']
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
state['exp_avg'] = torch.zeros_like(p.data)
state['exp_avg_sq'] = torch.zeros_like(p.data)
# Adam
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
state['step'] += 1
bias_correction1 = 1 - beta1 ** state['step']
bias_correction2 = 1 - beta2 ** state['step']
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
step_size = group['lr'] / bias_correction1
if nesterov:
perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
else:
perturb = exp_avg / denom
# Projection
wd_ratio = 1
if len(p.shape) > 1:
perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps'])
# Weight decay
if group['weight_decay'] > 0:
p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio)
# Step
p.data.add_(-step_size, perturb)
return loss

@ -1,6 +1,6 @@
import torch
from torch import optim as optim
from timm.optim import Nadam, RMSpropTF, AdamW, RAdam, NovoGrad, NvNovoGrad, Lookahead
from timm.optim import Nadam, RMSpropTF, AdamW, RAdam, NovoGrad, NvNovoGrad, Lookahead, AdamP, SGDP
try:
from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD
has_apex = True
@ -60,6 +60,14 @@ def create_optimizer(args, model, filter_bias_and_bn=True):
elif opt_lower == 'radam':
optimizer = RAdam(
parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
elif opt_lower == 'adamp':
optimizer = AdamP(
parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps,
delta=0.1, wd_ratio=0.01, nesterov=True)
elif opt_lower == 'sgdp':
optimizer = SGDP(
parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay,
eps=args.opt_eps, nesterov=True)
elif opt_lower == 'adadelta':
optimizer = optim.Adadelta(
parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)

@ -0,0 +1,96 @@
"""
SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py
Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217
Code: https://github.com/clovaai/AdamP
Copyright (c) 2020-present NAVER Corp.
MIT license
"""
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer, required
import math
class SGDP(Optimizer):
def __init__(self, params, lr=required, momentum=0, dampening=0,
weight_decay=0, nesterov=False, eps=1e-8, delta=0.1, wd_ratio=0.1):
defaults = dict(lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay,
nesterov=nesterov, eps=eps, delta=delta, wd_ratio=wd_ratio)
super(SGDP, self).__init__(params, defaults)
def _channel_view(self, x):
return x.view(x.size(0), -1)
def _layer_view(self, x):
return x.view(1, -1)
def _cosine_similarity(self, x, y, eps, view_func):
x = view_func(x)
y = view_func(y)
x_norm = x.norm(dim=1).add_(eps)
y_norm = y.norm(dim=1).add_(eps)
dot = (x * y).sum(dim=1)
return dot.abs() / x_norm / y_norm
def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
wd = 1
expand_size = [-1] + [1] * (len(p.shape) - 1)
for view_func in [self._channel_view, self._layer_view]:
cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)
if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
wd = wd_ratio
return perturb, wd
return perturb, wd
def step(self, closure=None):
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
state = self.state[p]
# State initialization
if len(state) == 0:
state['momentum'] = torch.zeros_like(p.data)
# SGD
buf = state['momentum']
buf.mul_(momentum).add_(1 - dampening, grad)
if nesterov:
d_p = grad + momentum * buf
else:
d_p = buf
# Projection
wd_ratio = 1
if len(p.shape) > 1:
d_p, wd_ratio = self._projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps'])
# Weight decay
if weight_decay != 0:
p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum))
# Step
p.data.add_(-group['lr'], d_p)
return loss
Loading…
Cancel
Save