From c0e6e5f3dbe8277c9afbfafcfe0b0404f7f69c2c Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Sat, 6 Apr 2019 13:59:15 -0700
Subject: [PATCH 1/2] Add common model interface to pnasnet and xception, update factory

---
 data/transforms.py      |  6 ++---
 models/model_factory.py |  5 ++++
 models/pnasnet.py       | 53 ++++++++++++++++++++++-------------------
 models/resnext.py       |  3 ---
 models/xception.py      | 42 ++++++++++++++++----------------
 5 files changed, 57 insertions(+), 52 deletions(-)

diff --git a/data/transforms.py b/data/transforms.py
index f1222e2b..90419ae0 100644
--- a/data/transforms.py
+++ b/data/transforms.py
@@ -20,7 +20,7 @@ def get_model_meanstd(model_name):
     model_name = model_name.lower()
     if 'dpn' in model_name:
         return IMAGENET_DPN_MEAN, IMAGENET_DPN_STD
-    elif 'ception' in model_name:
+    elif 'ception' in model_name or 'nasnet' in model_name:
         return IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD
     else:
         return IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
@@ -30,7 +30,7 @@ def get_model_mean(model_name):
     model_name = model_name.lower()
     if 'dpn' in model_name:
         return IMAGENET_DPN_STD
-    elif 'ception' in model_name:
+    elif 'ception' in model_name or 'nasnet' in model_name:
         return IMAGENET_INCEPTION_MEAN
     else:
         return IMAGENET_DEFAULT_MEAN
@@ -40,7 +40,7 @@ def get_model_std(model_name):
     model_name = model_name.lower()
     if 'dpn' in model_name:
         return IMAGENET_DEFAULT_STD
-    elif 'ception' in model_name:
+    elif 'ception' in model_name or 'nasnet' in model_name:
         return IMAGENET_INCEPTION_STD
     else:
         return IMAGENET_DEFAULT_STD
diff --git a/models/model_factory.py b/models/model_factory.py
index 68d6bd6d..c2a47fed 100644
--- a/models/model_factory.py
+++ b/models/model_factory.py
@@ -11,6 +11,7 @@
 from .senet import seresnet18, seresnet34, seresnet50, seresnet101, seresnet152, seresnext26_32x4d, seresnext50_32x4d, seresnext101_32x4d
 from .resnext import resnext50, resnext101, resnext152
 from .xception import xception
+from .pnasnet import pnasnet5large
 
 model_config_dict = {
     'resnet18': {
@@ -47,6 +48,8 @@ model_config_dict = {
         'model_name': 'inception_resnet_v2', 'num_classes': 1000, 'input_size': 299, 'normalizer': 'le'},
     'xception': {
         'model_name': 'xception', 'num_classes': 1000, 'input_size': 299, 'normalizer': 'le'},
+    'pnasnet5large': {
+        'model_name': 'pnasnet5large', 'num_classes': 1000, 'input_size': 331, 'normalizer': 'le'}
 }
 
@@ -125,6 +128,8 @@ def create_model(
         model = resnext152(num_classes=num_classes, pretrained=pretrained, **kwargs)
     elif model_name == 'xception':
         model = xception(num_classes=num_classes, pretrained=pretrained)
+    elif model_name == 'pnasnet5large':
+        model = pnasnet5large(num_classes=num_classes, pretrained=pretrained)
     else:
         assert False and "Invalid model"
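
[Editor's note] With the factory entry above in place, a quick smoke test of the new wiring looks roughly like the sketch below. This is a minimal sketch, not code from the patch: it assumes create_model takes the model name first plus the num_classes/pretrained keywords it forwards in the hunk above, and uses pretrained=False so no checkpoint download is needed.

    from models.model_factory import create_model
    from data.transforms import get_model_meanstd

    # 'pnasnet5large' now resolves through the common factory path
    model = create_model('pnasnet5large', num_classes=1000, pretrained=False)

    # the new 'nasnet' branch maps to the inception-style constants
    mean, std = get_model_meanstd('pnasnet5large')
    print(mean, std)  # (0.5, 0.5, 0.5) (0.5, 0.5, 0.5)

One pre-existing wart the transforms hunks leave untouched: get_model_mean returns IMAGENET_DPN_STD and get_model_std returns IMAGENET_DEFAULT_STD for the 'dpn' branch, which looks like a copy/paste slip worth a follow-up.
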
diff --git a/models/pnasnet.py b/models/pnasnet.py
index c169c695..6aebb772 100644
--- a/models/pnasnet.py
+++ b/models/pnasnet.py
@@ -5,7 +5,6 @@ import torch
 import torch.nn as nn
 import torch.utils.model_zoo as model_zoo
 
-
 pretrained_settings = {
     'pnasnet5large': {
         'imagenet': {
@@ -292,6 +291,8 @@ class PNASNet5Large(nn.Module):
     def __init__(self, num_classes=1001):
         super(PNASNet5Large, self).__init__()
         self.num_classes = num_classes
+        self.num_features = 4320
+
         self.conv_0 = nn.Sequential(OrderedDict([
             ('conv', nn.Conv2d(3, 96, kernel_size=3, stride=2, bias=False)),
             ('bn', nn.BatchNorm2d(96, eps=0.001))
@@ -335,9 +336,20 @@ class PNASNet5Large(nn.Module):
         self.relu = nn.ReLU()
         self.avg_pool = nn.AvgPool2d(11, stride=1, padding=0)
         self.dropout = nn.Dropout(0.5)
-        self.last_linear = nn.Linear(4320, num_classes)
+        self.last_linear = nn.Linear(self.num_features, num_classes)
+
+    def get_classifier(self):
+        return self.last_linear
+
+    def reset_classifier(self, num_classes):
+        self.num_classes = num_classes
+        del self.last_linear
+        if num_classes:
+            self.last_linear = nn.Linear(self.num_features, num_classes)
+        else:
+            self.last_linear = None
 
-    def features(self, x):
+    def forward_features(self, x, pool=True):
         x_conv_0 = self.conv_0(x)
         x_stem_0 = self.cell_stem_0(x_conv_0)
         x_stem_1 = self.cell_stem_1(x_conv_0, x_stem_0)
@@ -353,19 +365,16 @@ class PNASNet5Large(nn.Module):
         x_cell_9 = self.cell_9(x_cell_7, x_cell_8)
         x_cell_10 = self.cell_10(x_cell_8, x_cell_9)
         x_cell_11 = self.cell_11(x_cell_9, x_cell_10)
-        return x_cell_11
-
-    def logits(self, features):
-        x = self.relu(features)
-        x = self.avg_pool(x)
-        x = x.view(x.size(0), -1)
-        x = self.dropout(x)
-        x = self.last_linear(x)
+        x = self.relu(x_cell_11)
+        if pool:
+            x = self.avg_pool(x)
+            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, input):
-        x = self.features(input)
-        x = self.logits(x)
+        x = self.forward_features(input)
+        x = self.dropout(x)
+        x = self.last_linear(x)
         return x
 
 
@@ -375,7 +384,7 @@ def pnasnet5large(num_classes=1001, pretrained='imagenet'):
     `_ paper.
     """
     if pretrained:
-        settings = pretrained_settings['pnasnet5large'][pretrained]
+        settings = pretrained_settings['pnasnet5large']['imagenet']
         assert num_classes == settings[
             'num_classes'], 'num_classes should be {}, but is {}'.format(
             settings['num_classes'], num_classes)
@@ -384,18 +393,12 @@ def pnasnet5large(num_classes=1001, pretrained='imagenet'):
         model = PNASNet5Large(num_classes=1001)
         model.load_state_dict(model_zoo.load_url(settings['url']))
 
-        if pretrained == 'imagenet':
-            new_last_linear = nn.Linear(model.last_linear.in_features, 1000)
-            new_last_linear.weight.data = model.last_linear.weight.data[1:]
-            new_last_linear.bias.data = model.last_linear.bias.data[1:]
-            model.last_linear = new_last_linear
-
-        model.input_space = settings['input_space']
-        model.input_size = settings['input_size']
-        model.input_range = settings['input_range']
+        #if pretrained == 'imagenet':
+        new_last_linear = nn.Linear(model.last_linear.in_features, 1000)
+        new_last_linear.weight.data = model.last_linear.weight.data[1:]
+        new_last_linear.bias.data = model.last_linear.bias.data[1:]
+        model.last_linear = new_last_linear
 
-        model.mean = settings['mean']
-        model.std = settings['std']
     else:
         model = PNASNet5Large(num_classes=num_classes)
     return model
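
[Editor's note] The [1:] slicing above is the usual trick for TF-ported checkpoints: the stored classifier has 1001 outputs where index 0 is a "background" class, so dropping the first row and bias element leaves a clean 1000-class ImageNet head. A standalone sketch of that remap (toy layer standing in for the loaded head, not the real checkpoint):

    import torch.nn as nn

    old = nn.Linear(4320, 1001)            # stand-in for the loaded 1001-class head
    new = nn.Linear(old.in_features, 1000)
    new.weight.data = old.weight.data[1:]  # drop background row -> (1000, 4320)
    new.bias.data = old.bias.data[1:]      # drop background bias -> (1000,)

Note the remap now runs for any truthy pretrained value, consistent with the hard-coded ['imagenet'] settings lookup above it.
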
diff --git a/models/resnext.py b/models/resnext.py
index aafcd93b..57cb79f8 100644
--- a/models/resnext.py
+++ b/models/resnext.py
@@ -142,7 +142,6 @@ def resnext50(cardinality=32, base_width=4, pretrained=False, **kwargs):
     Args:
         cardinality (int): Cardinality of the aggregated transform
         base_width (int): Base width of the grouped convolution
-        shortcut ('A'|'B'|'C'): 'B' use 1x1 conv to downsample, 'C' use 1x1 conv on every residual connection
     """
     model = ResNeXt(
         ResNeXtBottleneckC, [3, 4, 6, 3], cardinality=cardinality, base_width=base_width, **kwargs)
@@ -155,7 +154,6 @@ def resnext101(cardinality=32, base_width=4, pretrained=False, **kwargs):
     Args:
         cardinality (int): Cardinality of the aggregated transform
         base_width (int): Base width of the grouped convolution
-        shortcut ('A'|'B'|'C'): 'B' use 1x1 conv to downsample, 'C' use 1x1 conv on every residual connection
     """
     model = ResNeXt(
         ResNeXtBottleneckC, [3, 4, 23, 3], cardinality=cardinality, base_width=base_width, **kwargs)
@@ -168,7 +166,6 @@ def resnext152(cardinality=32, base_width=4, pretrained=False, **kwargs):
     Args:
         cardinality (int): Cardinality of the aggregated transform
         base_width (int): Base width of the grouped convolution
-        shortcut ('A'|'B'|'C'): 'B' use 1x1 conv to downsample, 'C' use 1x1 conv on every residual connection
     """
     model = ResNeXt(
         ResNeXtBottleneckC, [3, 8, 36, 3], cardinality=cardinality, base_width=base_width, **kwargs)
diff --git a/models/xception.py b/models/xception.py
index c4ae09fa..97b3947d 100644
--- a/models/xception.py
+++ b/models/xception.py
@@ -127,6 +127,7 @@ class Xception(nn.Module):
         """
         super(Xception, self).__init__()
         self.num_classes = num_classes
+        self.num_features = 2048
 
         self.conv1 = nn.Conv2d(3, 32, 3, 2, 0, bias=False)
         self.bn1 = nn.BatchNorm2d(32)
@@ -156,10 +157,10 @@ class Xception(nn.Module):
         self.bn3 = nn.BatchNorm2d(1536)
 
         # do relu here
-        self.conv4 = SeparableConv2d(1536, 2048, 3, 1, 1)
-        self.bn4 = nn.BatchNorm2d(2048)
+        self.conv4 = SeparableConv2d(1536, self.num_features, 3, 1, 1)
+        self.bn4 = nn.BatchNorm2d(self.num_features)
 
-        self.fc = nn.Linear(2048, num_classes)
+        self.fc = nn.Linear(self.num_features, num_classes)
 
         # #------- init weights --------
         for m in self.modules():
@@ -169,7 +170,18 @@ class Xception(nn.Module):
             m.weight.data.fill_(1)
             m.bias.data.zero_()
 
-    def forward_features(self, input):
+    def get_classifier(self):
+        return self.fc
+
+    def reset_classifier(self, num_classes):
+        self.num_classes = num_classes
+        del self.fc
+        if num_classes:
+            self.fc = nn.Linear(self.num_features, num_classes)
+        else:
+            self.fc = None
+
+    def forward_features(self, input, pool=True):
         x = self.conv1(input)
         x = self.bn1(x)
         x = self.relu(x)
@@ -197,19 +209,16 @@ class Xception(nn.Module):
 
         x = self.conv4(x)
         x = self.bn4(x)
-        return x
-
-    def logits(self, features):
-        x = self.relu(features)
+        x = self.relu(x)
 
-        x = F.adaptive_avg_pool2d(x, (1, 1))
-        x = x.view(x.size(0), -1)
-        x = self.last_linear(x)
+        if pool:
+            x = F.adaptive_avg_pool2d(x, (1, 1))
+            x = x.view(x.size(0), -1)
         return x
 
     def forward(self, input):
         x = self.forward_features(input)
-        x = self.logits(x)
+        x = self.fc(x)
         return x
@@ -223,13 +232,4 @@ def xception(num_classes=1000, pretrained=False):
         model = Xception(num_classes=num_classes)
         model.load_state_dict(model_zoo.load_url(config['url']))
 
-        model.input_space = config['input_space']
-        model.input_size = config['input_size']
-        model.input_range = config['input_range']
-        model.mean = config['mean']
-        model.std = config['std']
-
-        # TODO: ugly
-        model.last_linear = model.fc
-        del model.fc
     return model
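
[Editor's note] After this patch, xception and pnasnet expose the same surface as the rest of the zoo: num_features, get_classifier(), reset_classifier(), and forward_features(x, pool=...). A short sketch of what that contract buys downstream code (spatial shapes assume the 299x299 input size from the factory config; only names from the patch are used):

    import torch
    from models.xception import Xception

    model = Xception(num_classes=1000)
    x = torch.randn(2, 3, 299, 299)

    fmap = model.forward_features(x, pool=False)  # (2, 2048, 10, 10) feature map
    feats = model.forward_features(x)             # (2, 2048) pooled features
    model.reset_classifier(10)                    # fresh 10-class head on num_features
    logits = model(x)                             # (2, 10)
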
From f1cd1a5ce3e1e9dde77f9cb5d024f6e3268da1b9 Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Sun, 7 Apr 2019 10:22:55 -0700
Subject: [PATCH 2/2] Cleanup CheckpointSaver, add support for increasing or
 decreasing metric, switch to prec1 metric in train loop

---
 train.py | 24 +++++++++++++++---------
 utils.py | 54 +++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 50 insertions(+), 28 deletions(-)

diff --git a/train.py b/train.py
index 89632d39..5494b3ff 100644
--- a/train.py
+++ b/train.py
@@ -93,6 +93,8 @@
 parser.add_argument('--amp', action='store_true', default=False,
                     help='use NVIDIA amp for mixed precision training')
 parser.add_argument('--output', default='', type=str, metavar='PATH',
                     help='path to output folder (default: none, current dir)')
+parser.add_argument('--eval-metric', default='prec1', type=str, metavar='EVAL_METRIC',
+                    help='Best metric (default: "prec1")')
 parser.add_argument("--local_rank", default=0, type=int)
 
@@ -238,10 +240,13 @@ def main():
     if args.local_rank == 0:
         print('Scheduled epochs: ', num_epochs)
 
+    eval_metric = args.eval_metric
     saver = None
     if output_dir:
-        saver = CheckpointSaver(checkpoint_dir=output_dir)
-    best_loss = None
+        decreasing = True if eval_metric == 'loss' else False
+        saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)
+    best_metric = None
+    best_epoch = None
     try:
         for epoch in range(start_epoch, num_epochs):
             if args.distributed:
@@ -255,15 +260,15 @@ def main():
                 model, loader_eval, validate_loss_fn, args)
 
             if lr_scheduler is not None:
-                lr_scheduler.step(epoch, eval_metrics['eval_loss'])
+                lr_scheduler.step(epoch, eval_metrics[eval_metric])
 
             update_summary(
                 epoch, train_metrics, eval_metrics, os.path.join(output_dir, 'summary.csv'),
-                write_header=best_loss is None)
+                write_header=best_metric is None)
 
             if saver is not None:
                 # save proper checkpoint with eval metric
-                best_loss = saver.save_checkpoint({
+                best_metric, best_epoch = saver.save_checkpoint({
                     'epoch': epoch + 1,
                     'arch': args.model,
                     'state_dict': model.state_dict(),
@@ -271,11 +276,12 @@ def main():
                     'args': args,
                 },
                 epoch=epoch + 1,
-                metric=eval_metrics['eval_loss'])
+                metric=eval_metrics[eval_metric])
     except KeyboardInterrupt:
         pass
-    print('*** Best loss: {0} (epoch {1})'.format(best_loss[1], best_loss[0]))
+    if best_metric is not None:
+        print('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
 
 
 def train_epoch(
@@ -363,7 +369,7 @@ def train_epoch(
 
         end = time.time()
 
-    return OrderedDict([('train_loss', losses_m.avg)])
+    return OrderedDict([('loss', losses_m.avg)])
 
 
 def validate(model, loader, loss_fn, args):
@@ -418,7 +424,7 @@ def validate(model, loader, loss_fn, args):
                     batch_time=batch_time_m, loss=losses_m,
                     top1=prec1_m, top5=prec5_m))
 
-    metrics = OrderedDict([('eval_loss', losses_m.avg), ('eval_prec1', prec1_m.avg)])
+    metrics = OrderedDict([('loss', losses_m.avg), ('prec1', prec1_m.avg), ('prec5', prec5_m.avg)])
 
     return metrics
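
[Editor's note] The metric keys returned by train_epoch and validate are now unprefixed, and --eval-metric picks which eval entry drives the scheduler and the checkpoint ranking (the train_/eval_ prefixes move into update_summary, patched below). The selection logic in miniature (a sketch; dicts and values are made up stand-ins for the real returns):

    from collections import OrderedDict

    train_metrics = OrderedDict([('loss', 1.23)])
    eval_metrics = OrderedDict([('loss', 0.95), ('prec1', 76.3), ('prec5', 93.1)])

    eval_metric = 'prec1'               # the new default
    decreasing = eval_metric == 'loss'  # loss is the only tracked metric that improves downward
    tracked = eval_metrics[eval_metric] # 76.3, passed as metric= to saver.save_checkpoint
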
diff --git a/utils.py b/utils.py
index 4604a258..f206f945 100644
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,7 @@ import os
 import shutil
 import glob
 import csv
+import operator
 from collections import OrderedDict
 
@@ -16,24 +17,32 @@ class CheckpointSaver:
             recovery_prefix='recovery',
             checkpoint_dir='',
             recovery_dir='',
+            decreasing=False,
+            verbose=True,
             max_history=10):
-        self.checkpoint_files = []
+
+        # state
+        self.checkpoint_files = []  # (filename, metric) tuples in order of decreasing betterness
+        self.best_epoch = None
         self.best_metric = None
-        self.worst_metric = None
-        self.max_history = max_history
-        assert self.max_history >= 1
         self.curr_recovery_file = ''
         self.last_recovery_file = ''
+
+        # config
         self.checkpoint_dir = checkpoint_dir
         self.recovery_dir = recovery_dir
         self.save_prefix = checkpoint_prefix
         self.recovery_prefix = recovery_prefix
         self.extension = '.pth.tar'
+        self.decreasing = decreasing  # a lower metric is better if True
+        self.cmp = operator.lt if decreasing else operator.gt  # True if lhs better than rhs
+        self.verbose = verbose
+        self.max_history = max_history
+        assert self.max_history >= 1
 
     def save_checkpoint(self, state, epoch, metric=None):
-        worst_metric = self.checkpoint_files[-1] if self.checkpoint_files else None
-        if len(self.checkpoint_files) < self.max_history or metric < worst_metric[1]:
+        worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None
+        if len(self.checkpoint_files) < self.max_history or self.cmp(metric, worst_file[1]):
             if len(self.checkpoint_files) >= self.max_history:
                 self._cleanup_checkpoints(1)
@@ -43,16 +52,21 @@ class CheckpointSaver:
                 state['metric'] = metric
             torch.save(state, save_path)
             self.checkpoint_files.append((save_path, metric))
-            self.checkpoint_files = sorted(self.checkpoint_files, key=lambda x: x[1])
-
-            print("Current checkpoints:")
-            for c in self.checkpoint_files:
-                print(c)
-
-        if metric is not None and (self.best_metric is None or metric < self.best_metric[1]):
-            self.best_metric = (epoch, metric)
+            self.checkpoint_files = sorted(
+                self.checkpoint_files, key=lambda x: x[1],
+                reverse=not self.decreasing)  # sort in descending order if a lower metric is not better
+
+            if self.verbose:
+                print("Current checkpoints:")
+                for c in self.checkpoint_files:
+                    print(c)
+
+        if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)):
+            self.best_epoch = epoch
+            self.best_metric = metric
             shutil.copyfile(save_path, os.path.join(self.checkpoint_dir, 'model_best' + self.extension))
+
+        return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch)
 
     def _cleanup_checkpoints(self, trim=0):
         trim = min(len(self.checkpoint_files), trim)
@@ -62,7 +76,8 @@ class CheckpointSaver:
         to_delete = self.checkpoint_files[delete_index:]
         for d in to_delete:
             try:
-                print('Cleaning checkpoint: ', d)
+                if self.verbose:
+                    print('Cleaning checkpoint: ', d)
                 os.remove(d[0])
             except Exception as e:
                 print('Exception (%s) while deleting checkpoint' % str(e))
@@ -74,7 +89,8 @@ class CheckpointSaver:
         torch.save(state, save_path)
         if os.path.exists(self.last_recovery_file):
             try:
-                print('Cleaning recovery', self.last_recovery_file)
+                if self.verbose:
+                    print('Cleaning recovery', self.last_recovery_file)
                 os.remove(self.last_recovery_file)
             except Exception as e:
                 print("Exception (%s) while removing %s" % (str(e), self.last_recovery_file))
@@ -143,8 +159,8 @@ def get_outdir(path, *paths, inc=False):
 
 def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False):
     rowd = OrderedDict(epoch=epoch)
-    rowd.update(train_metrics)
-    rowd.update(eval_metrics)
+    rowd.update([('train_' + k, v) for k, v in train_metrics.items()])
+    rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()])
     with open(filename, mode='a') as cf:
         dw = csv.DictWriter(cf, fieldnames=rowd.keys())
         if write_header:  # first iteration (epoch == 1 can't be used)
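
[Editor's note] The decreasing flag and the operator import collapse the higher-is-better and lower-is-better cases into a single comparator; both the model_best copy and the top-max_history retention go through it. Its semantics in a few lines of pure Python (mirroring CheckpointSaver.cmp from the hunk above):

    import operator

    cmp = operator.gt  # decreasing=False, e.g. prec1: higher is better
    assert cmp(76.3, 75.9) and not cmp(75.9, 76.3)

    cmp = operator.lt  # decreasing=True, e.g. loss: lower is better
    assert cmp(0.91, 0.95) and not cmp(0.95, 0.91)

With update_summary now prefixing keys, summary.csv gains columns like train_loss, eval_loss, eval_prec1, and eval_prec5 in place of the old ad-hoc names.
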