From 89147a91e6f6cdd30c6311b6c78a39767b7c2b9a Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Tue, 14 May 2019 18:29:34 -0700
Subject: [PATCH] Remove adabound optimizer, never got it working well on larger datasets

---
 optim/__init__.py      |   3 +-
 optim/adabound.py      | 121 -----------------------------------------
 optim/optim_factory.py |   6 +-
 3 files changed, 2 insertions(+), 128 deletions(-)
 delete mode 100644 optim/adabound.py

diff --git a/optim/__init__.py b/optim/__init__.py
index 2fdbcab9..c418c58a 100644
--- a/optim/__init__.py
+++ b/optim/__init__.py
@@ -1,4 +1,3 @@
-from optim.adabound import AdaBound
 from optim.nadam import Nadam
 from optim.rmsprop_tf import RMSpropTF
-from optim.optim_factory import create_optimizer
\ No newline at end of file
+from optim.optim_factory import create_optimizer
diff --git a/optim/adabound.py b/optim/adabound.py
deleted file mode 100644
index 3ff2712c..00000000
--- a/optim/adabound.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import math
-import torch
-from torch.optim import Optimizer
-
-
-class AdaBound(Optimizer):
-    """Implements AdaBound algorithm.
-    It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_.
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): Adam learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        final_lr (float, optional): final (SGD) learning rate (default: 0.1)
-        gamma (float, optional): convergence speed of the bound functions (default: 1e-3)
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm
-    .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate:
-        https://openreview.net/forum?id=Bkg3g2R9FX
-
-    Originally taken from https://github.com/Luolc/AdaBound
-    NOTE: Has not provided good (or even decent) results on large datasets like ImageNet
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), final_lr=0.1, gamma=1e-3,
-                 eps=1e-8, weight_decay=0, amsbound=False):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        if not 0.0 <= final_lr:
-            raise ValueError("Invalid final learning rate: {}".format(final_lr))
-        if not 0.0 <= gamma < 1.0:
-            raise ValueError("Invalid gamma parameter: {}".format(gamma))
-        defaults = dict(lr=lr, betas=betas, final_lr=final_lr, gamma=gamma, eps=eps,
-                        weight_decay=weight_decay, amsbound=amsbound)
-        super(AdaBound, self).__init__(params, defaults)
-
-        self.base_lrs = list(map(lambda group: group['lr'], self.param_groups))
-
-    def __setstate__(self, state):
-        super(AdaBound, self).__setstate__(state)
-        for group in self.param_groups:
-            group.setdefault('amsbound', False)
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group, base_lr in zip(self.param_groups, self.base_lrs):
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError(
-                        'Adam does not support sparse gradients, please consider SparseAdam instead')
-                amsbound = group['amsbound']
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-                    if amsbound:
-                        # Maintains max of all exp. moving avg. of sq. grad. values
-                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                if amsbound:
-                    max_exp_avg_sq = state['max_exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                state['step'] += 1
-
-                if group['weight_decay'] != 0:
-                    grad = grad.add(group['weight_decay'], p.data)
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                if amsbound:
-                    # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
-                    # Use the max. for normalizing running avg. of gradient
-                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
-                else:
-                    denom = exp_avg_sq.sqrt().add_(group['eps'])
-
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
-
-                # Applies bounds on actual learning rate
-                # lr_scheduler cannot affect final_lr, this is a workaround to apply lr decay
-                final_lr = group['final_lr'] * group['lr'] / base_lr
-                lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
-                upper_bound = final_lr * (1 + 1 / (group['gamma'] * state['step']))
-                step_size = torch.full_like(denom, step_size)
-                step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg)
-
-                p.data.add_(-step_size)
-
-        return loss
diff --git a/optim/optim_factory.py b/optim/optim_factory.py
index c0f77cd9..efd246a5 100644
--- a/optim/optim_factory.py
+++ b/optim/optim_factory.py
@@ -1,5 +1,5 @@
 from torch import optim as optim
-from optim import Nadam, AdaBound, RMSpropTF
+from optim import Nadam, RMSpropTF
 
 
 def add_weight_decay(model, weight_decay=1e-5, skip_list=()):
@@ -35,10 +35,6 @@ def create_optimizer(args, model, filter_bias_and_bn=True):
     elif args.opt.lower() == 'nadam':
         optimizer = Nadam(
             parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)
-    elif args.opt.lower() == 'adabound':
-        optimizer = AdaBound(
-            parameters, lr=args.lr / 100, weight_decay=weight_decay, eps=args.opt_eps,
-            final_lr=args.lr)
     elif args.opt.lower() == 'adadelta':
        optimizer = optim.Adadelta(
            parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps)