From 66253790d42e41064be9e53421e8b91dccbc890f Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 19 Oct 2021 16:06:38 -0700
Subject: [PATCH] Add `--bench profile` mode for benchmark.py to just run
 deepspeed detailed profile on model

---
 benchmark.py | 47 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index 98f2ef84..61bae0d4 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -147,19 +147,19 @@ def resolve_precision(precision: str):
     return use_amp, model_dtype, data_dtype
 
 
-def profile(model, input_size=(3, 224, 224)):
+def profile(model, input_size=(3, 224, 224), detailed=False):
     batch_size = 1
     macs, params = get_model_profile(
         model=model,
         input_res=(batch_size,) + input_size,  # input shape or input to the input_constructor
         input_constructor=None,  # if specified, a constructor taking input_res is used as input to the model
-        print_profile=False,  # prints the model graph with the measured profile attached to each module
-        detailed=False,  # print the detailed profile
+        print_profile=detailed,  # prints the model graph with the measured profile attached to each module
+        detailed=detailed,  # print the detailed profile
         warm_up=10,  # the number of warm-ups before measuring the time of each module
         as_string=False,  # print raw numbers (e.g. 1000) or as human-readable strings (e.g. 1k)
         output_file=None,  # path to the output file. If None, the profiler prints to stdout.
         ignore_modules=None)  # the list of modules to ignore in the profiling
-    return macs
+    return macs, params
 
 
 class BenchmarkRunner:
@@ -258,8 +258,8 @@ class InferenceBenchmarkRunner(BenchmarkRunner):
         )
 
         if get_model_profile is not None:
-            macs = profile(self.model, self.input_size)
-            results['GMACs'] = round(macs / 1e9, 2)
+            macs, _ = profile(self.model, self.input_size)
+            results['gmacs'] = round(macs / 1e9, 2)
 
         _logger.info(
             f"Inference benchmark of {self.model_name} done. "
@@ -388,6 +388,32 @@ class TrainBenchmarkRunner(BenchmarkRunner):
         return results
 
 
+class ProfileRunner(BenchmarkRunner):
+
+    def __init__(self, model_name, device='cuda', **kwargs):
+        super().__init__(model_name=model_name, device=device, **kwargs)
+        self.model.eval()
+
+    def run(self):
+        _logger.info(
+            f'Running profiler on {self.model_name} w/ '
+            f'input size {self.input_size} and batch size 1.')
+
+        macs, params = profile(self.model, self.input_size, detailed=True)
+
+        results = dict(
+            gmacs=round(macs / 1e9, 2),
+            img_size=self.input_size[-1],
+            param_count=round(params / 1e6, 2),
+        )
+
+        _logger.info(
+            f"Profile of {self.model_name} done. "
+            f"{results['gmacs']:.2f} GMACs, {results['param_count']:.2f} M params.")
+
+        return results
+
+
 def decay_batch_exp(batch_size, factor=0.5, divisor=16):
     out_batch_size = batch_size * factor
     if out_batch_size > divisor:
@@ -436,6 +462,9 @@ def benchmark(args):
     elif args.bench == 'train':
         bench_fns = TrainBenchmarkRunner,
         prefixes = 'train',
+    elif args.bench == 'profile':
+        assert get_model_profile is not None, "deepspeed needs to be installed for profile"
+        bench_fns = ProfileRunner,
 
     model_results = OrderedDict(model=model)
     for prefix, bench_fn in zip(prefixes, bench_fns):
@@ -483,7 +512,11 @@ def main():
                 results.append(r)
         except KeyboardInterrupt as e:
             pass
-        sort_key = 'train_samples_per_sec' if 'train' in args.bench else 'infer_samples_per_sec'
+        sort_key = 'infer_samples_per_sec'
+        if 'train' in args.bench:
+            sort_key = 'train_samples_per_sec'
+        elif 'profile' in args.bench:
+            sort_key = 'infer_gmacs'
         results = sorted(results, key=lambda x: x[sort_key], reverse=True)
         if len(results):
            write_results(results_file, results)