diff --git a/post_quantization_validation.py b/post_quantization_validate.py
old mode 100644
new mode 100755
similarity index 57%
rename from post_quantization_validation.py
rename to post_quantization_validate.py
index 6ad3ce2f..73c09dd4
--- a/post_quantization_validation.py
+++ b/post_quantization_validate.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 """ ImageNet Validation Script
 
 This is intended to be a lean and easily modifiable ImageNet validation script for evaluating pretrained
@@ -17,18 +17,31 @@ import torch
 import torch.nn as nn
 import torch.nn.parallel
 from collections import OrderedDict
-import torch.quantization
+from contextlib import suppress
 
-try:
-    from apex import amp
-    has_apex = True
-except ImportError:
-    has_apex = False
+import copy
+import torch.quantization
+from torch.quantization import quantize_fx
+# currently, quantization only runs on CPUs
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
 
 from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models
-from timm.data import Dataset, DatasetTar, resolve_data_config, RealLabelsImagenet
-from timm.data.quant_loader import create_loader
-from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging
+from timm.data import create_dataset, create_loader, resolve_data_config, RealLabelsImagenet
+from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy
+
+#has_apex = False
+#try:
+#    from apex import amp
+#    has_apex = True
+#except ImportError:
+#    pass
+
+#has_native_amp = False
+#try:
+#    if getattr(torch.cuda.amp, 'autocast') is not None:
+#        has_native_amp = True
+#except AttributeError:
+#    pass
 
 torch.backends.cudnn.benchmark = True
 _logger = logging.getLogger('validate')
@@ -37,18 +50,26 @@ _logger = logging.getLogger('validate')
 parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation')
 parser.add_argument('data', metavar='DIR',
                     help='path to dataset')
+parser.add_argument('--dataset', '-d', metavar='NAME', default='',
+                    help='dataset type (default: ImageFolder/ImageTar if empty)')
+# argument for the calibration dataset
 parser.add_argument('--calib-data', metavar='DIR',
-                    help='path to calibration dataset')
-parser.add_argument('--model', '-m', metavar='MODEL', default='dpn92',
+                    help='path to calibration dataset')
+# quantization option (weight_only, dynamic, static)
+parser.add_argument('--quant_option', metavar='NAME', default='static',
+                    help='quantization option (weight_only, dynamic, static) (default: static)')
+parser.add_argument('--split', metavar='NAME', default='validation',
+                    help='dataset split (default: validation)')
+parser.add_argument('--model', '-m', metavar='NAME', default='dpn92',
                     help='model architecture (default: dpn92)')
 parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                     help='number of data loading workers (default: 2)')
 parser.add_argument('-b', '--batch-size', default=256, type=int,
                     metavar='N', help='mini-batch size (default: 256)')
-parser.add_argument('--calib-iter', default=100, type=int,
                    metavar='N', help='Train set iterations for calibration before quantization')
 parser.add_argument('--img-size', default=None, type=int,
                     metavar='N', help='Input image dimension, uses model default if empty')
+parser.add_argument('--input-size', default=None, nargs=3, type=int,
+                    metavar='N N N', help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty')
 parser.add_argument('--crop-pct', default=None, type=float,
                     metavar='N', help='Input image center crop pct')
 parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN',
@@ -57,26 +78,37 @@ parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD'
                     help='Override std deviation of of dataset')
 parser.add_argument('--interpolation', default='', type=str, metavar='NAME',
                     help='Image resize interpolation type (overrides model)')
-parser.add_argument('--num-classes', type=int, default=1000,
+parser.add_argument('--num-classes', type=int, default=None,
                     help='Number classes in dataset')
 parser.add_argument('--class-map', default='', type=str, metavar='FILENAME',
                     help='path to class to idx mapping file (default: "")')
+parser.add_argument('--gp', default=None, type=str, metavar='POOL',
+                    help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.')
 parser.add_argument('--log-freq', default=10, type=int,
                     metavar='N', help='batch logging frequency (default: 10)')
 parser.add_argument('--checkpoint', default='', type=str, metavar='PATH',
                     help='path to latest checkpoint (default: none)')
 parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                     help='use pre-trained model')
-parser.add_argument('--num-gpu', type=int, default=1,
-                    help='Number of GPUS to use')
+#parser.add_argument('--num-gpu', type=int, default=1,
+#                    help='Number of GPUS to use')
+# num-gpu is set to zero (no gpu usage)
+parser.add_argument('--num-gpu', type=int, default=0,
+                    help='Number of GPUS to use')
 parser.add_argument('--no-test-pool', dest='no_test_pool', action='store_true',
                     help='disable test time pool')
 parser.add_argument('--no-prefetcher', action='store_true', default=False,
                     help='disable fast prefetcher')
 parser.add_argument('--pin-mem', action='store_true', default=False,
                     help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
-parser.add_argument('--amp', action='store_true', default=False,
-                    help='Use AMP mixed precision')
+parser.add_argument('--channels-last', action='store_true', default=False,
+                    help='Use channels_last memory layout')
+#parser.add_argument('--amp', action='store_true', default=False,
+#                    help='Use AMP mixed precision. Defaults to Apex, fallback to native Torch AMP.')
+#parser.add_argument('--apex-amp', action='store_true', default=False,
+#                    help='Use NVIDIA Apex AMP mixed precision')
+#parser.add_argument('--native-amp', action='store_true', default=False,
+#                    help='Use Native Torch AMP mixed precision')
 parser.add_argument('--tf-preprocessing', action='store_true', default=False,
                     help='Use Tensorflow preprocessing pipeline (require CPU TF installed')
 parser.add_argument('--use-ema', dest='use_ema', action='store_true',
@@ -93,24 +125,28 @@ parser.add_argument('--valid-labels', default='', type=str, metavar='FILENAME',
                     help='Valid label indices txt file for validation of partial label space')
 
 
-def set_jit_legacy():
-    """ Set JIT executor to legacy w/ support for op fusion
-    This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes
-    in the JIT exectutor. These API are not supported so could change.
-    """
-    #
-    assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!"
-    torch._C._jit_set_profiling_executor(False)
-    torch._C._jit_set_profiling_mode(False)
-    torch._C._jit_override_can_fuse_on_gpu(True)
-    #torch._C._jit_set_texpr_fuser_enabled(True)
-
-
 def validate(args):
     # might as well try to validate something
     args.pretrained = args.pretrained or not args.checkpoint
     args.prefetcher = not args.no_prefetcher
+#    amp_autocast = suppress  # do nothing
+#    if args.amp:
+#        if has_native_amp:
+#            args.native_amp = True
+#        elif has_apex:
+#            args.apex_amp = True
+#        else:
+#            _logger.warning("Neither APEX or Native Torch AMP is available.")
+#    assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set."
+#    if args.native_amp:
+#        amp_autocast = torch.cuda.amp.autocast
+#        _logger.info('Validating in mixed precision with native PyTorch AMP.')
+#    elif args.apex_amp:
+#        _logger.info('Validating in mixed precision with NVIDIA APEX AMP.')
+#    else:
+#        _logger.info('Validating in float32. AMP not enabled.')
+
     if args.legacy_jit:
         set_jit_legacy()
 
@@ -120,30 +156,50 @@ def validate(args):
         pretrained=args.pretrained,
         num_classes=args.num_classes,
         in_chans=3,
+        global_pool=args.gp,
         scriptable=args.torchscript)
+    if args.num_classes is None:
+        assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.'
+        args.num_classes = model.num_classes
 
     if args.checkpoint:
-        load_checkpoint(model, args.checkpoint, args.use_ema)
+        load_checkpoint(model, args.checkpoint, args.use_ema)
 
     param_count = sum([m.numel() for m in model.parameters()])
     _logger.info('Model %s created, param count: %d' % (args.model, param_count))
 
-    data_config = resolve_data_config(vars(args), model=model)
-    model, test_time_pool = apply_test_time_pool(model, data_config, args)
+    data_config = resolve_data_config(vars(args), model=model, use_test_size=True)
+    test_time_pool = False
+    if not args.no_test_pool:
+        model, test_time_pool = apply_test_time_pool(model, data_config, use_test_size=True)
 
     if args.torchscript:
         torch.jit.optimized_execution(True)
         model = torch.jit.script(model)
 
+#    model = model.cuda()
+#    if args.apex_amp:
+#        model = amp.initialize(model, opt_level='O1')
+
+    if args.channels_last:
+        model = model.to(memory_format=torch.channels_last)
+
+#    if args.num_gpu > 1:
+#        model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu)))
+
+#    criterion = nn.CrossEntropyLoss().cuda()
     criterion = nn.CrossEntropyLoss()
 
+    dataset = create_dataset(
+        root=args.data, name=args.dataset, split=args.split,
+        load_bytes=args.tf_preprocessing, class_map=args.class_map)
+
+    # added for post quantization calibration; rooted at --calib-data, not the validation data
+    calib_dataset = create_dataset(
+        root=args.calib_data, name=args.dataset, split=args.split,
+        load_bytes=args.tf_preprocessing, class_map=args.class_map)
+
-    if os.path.splitext(args.data)[1] == '.tar' and os.path.isfile(args.data):
-        dataset = DatasetTar(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map)
-        calib_dataset = DatasetTar(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map)
-    else:
-        dataset = Dataset(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map)
-        calib_dataset = Dataset(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map)
 
     if args.valid_labels:
         with open(args.valid_labels, 'r') as f:
             valid_labels = {int(line.rstrip()) for line in f}
@@ -169,6 +225,8 @@ def validate(args):
         crop_pct=crop_pct,
         pin_memory=args.pin_mem,
         tf_preprocessing=args.tf_preprocessing)
+
+    # also create a loader for the calibration dataset
     calib_loader = create_loader(
         calib_dataset,
         input_size=data_config['input_size'],
@@ -180,94 +238,108 @@ def validate(args):
         num_workers=args.workers,
         crop_pct=crop_pct,
         pin_memory=args.pin_mem,
-        tf_preprocessing=args.tf_preprocessing)
+        tf_preprocessing=args.tf_preprocessing)
+
 
     batch_time = AverageMeter()
     losses = AverageMeter()
     top1 = AverageMeter()
     top5 = AverageMeter()
-    print('Start calibration of quantization observers before post-quantization')
-    model.eval()
-    model.fuse_model()
-    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
-    print(model.qconfig)
-    torch.quantization.prepare(model, inplace=True)
-
-    with torch.no_grad():
-        # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
-        input = torch.randn((args.batch_size,) + data_config['input_size'])
-        model(input)
-        end = time.time()
-        for batch_idx, (input, target) in enumerate(calib_loader):
-            if batch_idx > args.calib_iter:
-                break
-            if args.no_prefetcher:
-                if args.fp16:
-                    input = input.half()
-            # compute output
-            output = model(input)
-            if valid_labels is not None:
-                output = output[:, valid_labels]
-            loss = criterion(output, target)
-
-            if real_labels is not None:
-                real_labels.add_result(output)
-
-            # measure accuracy and record loss
-            acc1, acc5 = accuracy(output.data, target, topk=(1, 5))
-            losses.update(loss.item(), input.size(0))
-            top1.update(acc1.item(), input.size(0))
-            top5.update(acc5.item(), input.size(0))
-
-            # measure elapsed time
-            batch_time.update(time.time() - end)
-            end = time.time()
-
-            if batch_idx % args.log_freq == 0:
-                _logger.info(
-                    'Test: [{0:>4d}/{1}] '
-                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
-                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
-                    'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
-                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
-                        batch_idx, args.calib_iter, batch_time=batch_time,
-                        rate_avg=input.size(0) / batch_time.avg,
-                        loss=losses, top1=top1, top5=top5))
-
-    if real_labels is not None:
-        # real labels mode replaces topk values at the end
-        top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5)
+    print('Start calibration of quantization observers before post-quantization')
+    model_to_quantize = copy.deepcopy(model)
+    model_to_quantize.eval()
+
+    # post training static quantization
+    if args.quant_option == 'static':
+        qconfig_dict = {"": torch.quantization.get_default_qconfig('qnnpack')}
+        # prepare: insert observers at quantizable ops
+        model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
+        # calibrate
+        with torch.no_grad():
+            # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
+            input = torch.randn((args.batch_size,) + tuple(data_config['input_size']))
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
+            model(input)
+            end = time.time()
+            for batch_idx, (input, target) in enumerate(calib_loader):
+
+                if args.channels_last:
+                    input = input.contiguous(memory_format=torch.channels_last)
+
+                # compute output, feeding the observers inserted by prepare_fx
+                output = model_prepared(input)
+                if valid_labels is not None:
+                    output = output[:, valid_labels]
+                loss = criterion(output, target)
+
+                if real_labels is not None:
+                    real_labels.add_result(output)
+
+                # measure accuracy and record loss
+                acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
+                losses.update(loss.item(), input.size(0))
+                top1.update(acc1.item(), input.size(0))
+                top5.update(acc5.item(), input.size(0))
+
+                # measure elapsed time
+                batch_time.update(time.time() - end)
+                end = time.time()
+
+                if batch_idx % args.log_freq == 0:
+                    _logger.info(
+                        'Test: [{0:>4d}/{1}] '
+                        'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                        'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+                        'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
+                        'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
+                            batch_idx, len(calib_loader), batch_time=batch_time,
+                            rate_avg=input.size(0) / batch_time.avg,
+                            loss=losses, top1=top1, top5=top5))
+        # quantize
+        model_quantized = quantize_fx.convert_fx(model_prepared)
+    # post training dynamic/weight only quantization
+    elif args.quant_option == 'dynamic':
+        qconfig_dict = {"": torch.quantization.default_dynamic_qconfig}
+        # prepare
+        model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
+        # no calibration needed for dynamic/weight_only quantization
+        # quantize
+        model_quantized = quantize_fx.convert_fx(model_prepared)
     else:
-        top1a, top5a = top1.avg, top5.avg
-    results = OrderedDict(
-        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
-        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
-        param_count=round(param_count / 1e6, 2),
-        img_size=data_config['input_size'][-1],
-        cropt_pct=crop_pct,
-        interpolation=data_config['interpolation'])
-
-    _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
-        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
-
-    print('Start validation of post-quantized model')
-    torch.quantization.convert(model.eval(),inplace = True)
-
-    batch_time = AverageMeter()
-    losses = AverageMeter()
-    top1 = AverageMeter()
-    top5 = AverageMeter()
+        raise ValueError("Invalid or unimplemented quantization option '%s' (use static or dynamic)" % args.quant_option)
+
+    # validate the post-quantized model
+    model = model_quantized
+
+    print('Start validation of post-quantized model')
+    # reset the meters so calibration stats do not leak into the final results
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
 
     with torch.no_grad():
         # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
-        input = torch.randn((args.batch_size,) + data_config['input_size'])
+#        input = torch.randn((args.batch_size,) + tuple(data_config['input_size'])).cuda()
+        input = torch.randn((args.batch_size,) + tuple(data_config['input_size']))
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
         model(input)
         end = time.time()
         for batch_idx, (input, target) in enumerate(loader):
-            if args.no_prefetcher:
-                if args.fp16:
-                    input = input.half()
+            # if args.no_prefetcher:
+            #     target = target.cuda()
+            #     input = input.cuda()
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
 
             # compute output
-            output = model(input)
+            # with amp_autocast():
+            #     output = model(input)
+            output = model(input)
+
             if valid_labels is not None:
                 output = output[:, valid_labels]
             loss = criterion(output, target)
@@ -276,7 +348,7 @@ def validate(args):
                 real_labels.add_result(output)
 
             # measure accuracy and record loss
-            acc1, acc5 = accuracy(output.data, target, topk=(1, 5))
+            acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
             top1.update(acc1.item(), input.size(0))
             top5.update(acc5.item(), input.size(0))
@@ -310,7 +382,8 @@ def validate(args):
         interpolation=data_config['interpolation'])
 
     _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
-        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+
 
     return results
 
@@ -329,7 +402,7 @@ def main():
     if args.model == 'all':
         # validate all models in a list of names with pretrained checkpoints
         args.pretrained = True
-        model_names = list_models(pretrained=True)
+        model_names = list_models(pretrained=True, exclude_filters=['*in21k'])
         model_cfgs = [(n, '') for n in model_names]
     elif not is_model(args.model):
         # model name doesn't exist, try as wildcard filter
@@ -349,7 +422,7 @@ def main():
     result = OrderedDict(model=args.model)
     r = {}
     while not r and batch_size >= args.num_gpu:
-        torch.cuda.empty_cache()
+#        torch.cuda.empty_cache()
         try:
             args.batch_size = batch_size
             print('Validating with batch size: %d' % args.batch_size)
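
For reference, the FX graph mode flow this patch switches to (prepare_fx -> calibrate -> convert_fx) can be exercised standalone. The sketch below is illustrative only: the resnet18 stand-in model, batch count, and input shape are assumptions, not part of the patch. It targets the torch.quantization.quantize_fx API as used here; newer torch.ao.quantization releases also require an example_inputs argument to prepare_fx.

# minimal_fx_ptq.py -- hypothetical standalone sketch of the flow used above
import torch
from torch.quantization import get_default_qconfig, quantize_fx
from torchvision.models import resnet18  # stand-in model, not from this patch

model_fp32 = resnet18().eval()                       # any fp32 model in eval mode
qconfig_dict = {"": get_default_qconfig('qnnpack')}  # 'fbgemm' for x86 server CPUs

# prepare: symbolically trace the model and insert observers at quantizable ops
model_prepared = quantize_fx.prepare_fx(model_fp32, qconfig_dict)

# calibrate: run representative batches so observers record activation ranges
with torch.no_grad():
    for _ in range(8):
        model_prepared(torch.randn(2, 3, 224, 224))

# convert: replace observed ops with int8 quantized modules, then run on CPU
model_int8 = quantize_fx.convert_fx(model_prepared)
print(model_int8(torch.randn(2, 3, 224, 224)).shape)

With the arguments defined above, a matching invocation of the renamed script would be along the lines of: python post_quantization_validate.py /data/imagenet --calib-data /data/imagenet-calib --model resnet50 --pretrained --quant_option static (both dataset paths are placeholders).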