From 0557c8257d0a7fcf02456a5ea7229476716f26d8 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Mon, 28 Feb 2022 17:06:32 -0800
Subject: [PATCH] Fix bug introduced in non layer_decay weight_decay
 application. Remove debug print, fix arg desc.

---
 timm/models/helpers.py      | 1 -
 timm/optim/optim_factory.py | 2 +-
 train.py                    | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/timm/models/helpers.py b/timm/models/helpers.py
index dd45e5ea..96d3bd7f 100644
--- a/timm/models/helpers.py
+++ b/timm/models/helpers.py
@@ -660,7 +660,6 @@ def group_with_matcher(
     for k in sorted(filter(lambda x: x is not None, grouping.keys())):
         if lid < 0 or k[-1] != MATCH_PREV_GROUP[0]:
             lid += 1
-        print(lid, k, grouping[k])
         layer_id_to_param[lid].extend(grouping[k])

     if reverse:
diff --git a/timm/optim/optim_factory.py b/timm/optim/optim_factory.py
index 842d18f9..06788d2e 100644
--- a/timm/optim/optim_factory.py
+++ b/timm/optim/optim_factory.py
@@ -44,7 +44,7 @@ def param_groups_weight_decay(
         if not param.requires_grad:
             continue

-        if param.ndim or name.endswith(".bias") or name in no_weight_decay_list:
+        if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list:
             no_decay.append(param)
         else:
             decay.append(param)
diff --git a/train.py b/train.py
index ea127251..0efef787 100755
--- a/train.py
+++ b/train.py
@@ -140,7 +140,7 @@ parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM',
 parser.add_argument('--clip-mode', type=str, default='norm',
                     help='Gradient clipping mode. One of ("norm", "value", "agc")')
 parser.add_argument('--layer-decay', type=float, default=None,
-                    help='weight decay (default: None)')
+                    help='layer-wise learning rate decay (default: None)')

 # Learning rate schedule parameters
 parser.add_argument('--sched', default='cosine', type=str, metavar='SCHEDULER',
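
Note: for reference, a minimal standalone sketch of the corrected grouping
logic touched by the optim_factory.py hunk above. Only the changed check and
the loop context come from the diff; the signature defaults and the returned
param-group format are assumptions for illustration, not part of the patch.
The buggy `if param.ndim` was truthy for every tensor with ndim >= 1, so
nearly all parameters landed in the no-decay group; `param.ndim <= 1` limits
that group to 1-D tensors (biases, norm scales) plus explicitly listed names.

    import torch
    import torch.nn as nn


    def param_groups_weight_decay(model, weight_decay=1e-5, no_weight_decay_list=()):
        # Split parameters into decay / no-decay groups (sketch of corrected logic).
        no_weight_decay_list = set(no_weight_decay_list)
        decay, no_decay = [], []
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            # Fixed check: 1-D params, ".bias" names, and listed names skip decay.
            if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list:
                no_decay.append(param)
            else:
                decay.append(param)
        return [
            {'params': no_decay, 'weight_decay': 0.},
            {'params': decay, 'weight_decay': weight_decay},
        ]


    if __name__ == '__main__':
        model = nn.Linear(8, 4)  # one 2-D weight (decay), one 1-D bias (no decay)
        groups = param_groups_weight_decay(model, weight_decay=0.05)
        optimizer = torch.optim.AdamW(groups, lr=1e-3)
        print([len(g['params']) for g in optimizer.param_groups])  # [1, 1]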