diff --git a/benchmark.py b/benchmark.py
index 5e07e5fe..01f1b2ea 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -473,6 +473,7 @@ def decay_batch_exp(batch_size, factor=0.5, divisor=16):
 def _try_run(model_name, bench_fn, initial_batch_size, bench_kwargs):
     batch_size = initial_batch_size
     results = dict()
+    error_str = 'Unknown'
     while batch_size >= 1:
         torch.cuda.empty_cache()
         try:
@@ -480,13 +481,13 @@ def _try_run(model_name, bench_fn, initial_batch_size, bench_kwargs):
             results = bench.run()
             return results
         except RuntimeError as e:
-            e_str = str(e)
-            print(e_str)
-            if 'channels_last' in e_str:
-                print(f'Error: {model_name} not supported in channels_last, skipping.')
+            error_str = str(e)
+            if 'channels_last' in error_str:
+                _logger.error(f'{model_name} not supported in channels_last, skipping.')
                 break
-            print(f'Error: "{e_str}" while running benchmark. Reducing batch size to {batch_size} for retry.')
+            _logger.warning(f'"{error_str}" while running benchmark. Reducing batch size to {batch_size} for retry.')
             batch_size = decay_batch_exp(batch_size)
+    results['error'] = error_str
     return results


@@ -528,13 +529,14 @@ def benchmark(args):
     model_results = OrderedDict(model=model)
     for prefix, bench_fn in zip(prefixes, bench_fns):
         run_results = _try_run(model, bench_fn, initial_batch_size=batch_size, bench_kwargs=bench_kwargs)
-        if prefix:
+        if prefix and 'error' not in run_results:
             run_results = {'_'.join([prefix, k]): v for k, v in run_results.items()}
         model_results.update(run_results)
-    param_count = model_results.pop('infer_param_count', model_results.pop('train_param_count', 0))
-    model_results.setdefault('param_count', param_count)
-    model_results.pop('train_param_count', 0)
-    return model_results if model_results['param_count'] else dict()
+    if 'error' not in model_results:
+        param_count = model_results.pop('infer_param_count', model_results.pop('train_param_count', 0))
+        model_results.setdefault('param_count', param_count)
+        model_results.pop('train_param_count', 0)
+    return model_results


 def main():
@@ -578,13 +580,15 @@ def main():
             sort_key = 'train_samples_per_sec'
         elif 'profile' in args.bench:
             sort_key = 'infer_gmacs'
+        results = filter(lambda x: sort_key in x, results)
         results = sorted(results, key=lambda x: x[sort_key], reverse=True)
         if len(results):
             write_results(results_file, results)
     else:
         results = benchmark(args)
-    json_str = json.dumps(results, indent=4)
-    print(json_str)
+
+    # output results in JSON to stdout w/ delimiter for runner script
+    print(f'--result\n{json.dumps(results, indent=4)}')


 def write_results(results_file, results):
diff --git a/validate.py b/validate.py
index bbb1e8dc..10446302 100755
--- a/validate.py
+++ b/validate.py
@@ -11,6 +11,7 @@ import argparse
 import os
 import csv
 import glob
+import json
 import time
 import logging
 import torch
@@ -263,6 +264,7 @@ def validate(args):
     else:
         top1a, top5a = top1.avg, top5.avg
     results = OrderedDict(
+        model=args.model,
         top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
         top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
         param_count=round(param_count / 1e6, 2),
@@ -276,6 +278,27 @@
     return results


+def _try_run(args, initial_batch_size):
+    batch_size = initial_batch_size
+    results = OrderedDict()
+    error_str = 'Unknown'
+    while batch_size >= 1:
+        args.batch_size = batch_size
+        torch.cuda.empty_cache()
+        try:
+            results = validate(args)
+            return results
+        except RuntimeError as e:
+            error_str = str(e)
+            if 'channels_last' in error_str:
+                break
+            _logger.warning(f'"{error_str}" while running validation. Reducing batch size to {batch_size} for retry.')
+            batch_size = batch_size // 2
+    results['error'] = error_str
+    _logger.error(f'{args.model} failed to validate ({error_str}).')
+    return results
+
+
 def main():
     setup_default_logging()
     args = parser.parse_args()
@@ -308,36 +331,25 @@ def main():
         _logger.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names)))
         results = []
         try:
-            start_batch_size = args.batch_size
+            initial_batch_size = args.batch_size
             for m, c in model_cfgs:
-                batch_size = start_batch_size
                 args.model = m
                 args.checkpoint = c
-                result = OrderedDict(model=args.model)
-                r = {}
-                while not r and batch_size >= args.num_gpu:
-                    torch.cuda.empty_cache()
-                    try:
-                        args.batch_size = batch_size
-                        print('Validating with batch size: %d' % args.batch_size)
-                        r = validate(args)
-                    except RuntimeError as e:
-                        if batch_size <= args.num_gpu:
-                            print("Validation failed with no ability to reduce batch size. Exiting.")
-                            raise e
-                        batch_size = max(batch_size // 2, args.num_gpu)
-                        print("Validation failed, reducing batch size by 50%")
-                result.update(r)
+                r = _try_run(args, initial_batch_size)
+                if 'error' in r:
+                    continue
                 if args.checkpoint:
-                    result['checkpoint'] = args.checkpoint
-                results.append(result)
+                    r['checkpoint'] = args.checkpoint
+                results.append(r)
         except KeyboardInterrupt as e:
             pass
         results = sorted(results, key=lambda x: x['top1'], reverse=True)
         if len(results):
             write_results(results_file, results)
     else:
-        validate(args)
+        results = validate(args)
+    # output results in JSON to stdout w/ delimiter for runner script
+    print(f'--result\n{json.dumps(results, indent=4)}')


 def write_results(results_file, results):
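For context (not part of the diff): both scripts now emit a `--result` line followed by an indented JSON document on stdout. A minimal sketch of how a downstream runner script might consume that output is below; the `run_and_collect` helper and the example command line are assumptions for illustration, not something this change provides.

```python
# Hypothetical runner-side helper (illustration only, not part of this change).
# benchmark.py and validate.py now print a '--result' line followed by a JSON
# document, so a runner can split stdout on that delimiter and parse the tail.
import json
import subprocess


def run_and_collect(cmd):
    """Run a benchmark/validate command and return its parsed JSON results."""
    out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
    # Keep only the text after the last '--result' delimiter line.
    _, _, payload = out.rpartition('--result\n')
    return json.loads(payload)


if __name__ == '__main__':
    # Example invocation; the model name and flags are placeholders.
    results = run_and_collect(['python', 'validate.py', '--model', 'resnet50', '--pretrained'])
    print(results.get('top1'), results.get('param_count'))
```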